Extracting collocations in Python

Author

Masaki EGUCHI, Ph.D.

Modified

July 29, 2025

In this notebook, I will demonstrate how to extract collocations from a corpus.

Preparation

Let’s load the necessary packages.

Show code
# Load packages

import spacy
import glob
import pandas as pd

# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

Let’s specify the path to the corpus files.

Show code
CORPUS_FILES = glob.glob("../../../corpus_data/brown_single/*.txt")

Define the extraction code

Show code

# Define dependency relations for collocation extraction.
# Each entry is a (dependency label, pos1, pos2) tuple; extract_collocations
# unpacks the tuples in exactly that order.
# Common patterns: adj-noun, verb-object, verb-adverb, noun-noun
COLLOCATION_PATTERNS = [
    ("amod", "ADJ", "NOUN"),     # adjective modifier (e.g., "big house")
    ("dobj", "VERB", "NOUN"),    # direct object (e.g., "eat food")
    ("advmod", "VERB", "ADV"),   # adverb modifier (e.g., "run quickly")
    # NOTE(review): spaCy's English models usually parse "cup of tea" as
    # prep + pobj rather than nmod -- confirm that this pattern matches the
    # intended noun-noun examples.
    ("nmod", "NOUN", "NOUN"),    # noun modifier (e.g., "cup of tea")
    ("compound", "NOUN", "NOUN"), # compound nouns (e.g., "computer science")
]
Show code

def load_file(filepath):
    """Read the file at *filepath* as UTF-8 text and return its full contents."""
    with open(filepath, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def extract_collocations(doc, patterns=None):
    """Extract collocations based on dependency patterns.

    Parameters:
    - doc: iterable of parsed tokens exposing ``dep_``, ``pos_``, ``lemma_``
      and ``head`` (e.g., a spaCy Doc)
    - patterns: iterable of (dep_rel, pos1, pos2) tuples, where pos1 and pos2
      are the POS tags required of the first and second word of the reported
      pair. When omitted or None, COLLOCATION_PATTERNS is used.

    Returns:
    - list of (word1, word2, dep_rel) tuples holding lower-cased lemmas, one
      per matching token/pattern combination, in document order.
    """
    if patterns is None:
        # Resolved at call time so the function always sees the current
        # module-level pattern list.
        patterns = COLLOCATION_PATTERNS

    # Relations whose dependent (the modifier) is reported as the first word.
    # "advmod" is deliberately NOT listed here: its pattern is written
    # head-first ("VERB", "ADV"), so the verb is the head and the adverb is the
    # dependent. Treating it as modifier-first would require the adverb token
    # itself to be a VERB and would never match pairs like "run quickly".
    modifier_first = ("amod", "compound")

    collocations = []

    for token in doc:
        for dep_rel, pos1, pos2 in patterns:
            # Only the dependent token carries the relation label
            if token.dep_ != dep_rel:
                continue

            if dep_rel in modifier_first:
                # modifier is word1, its head is word2
                first, second = token, token.head
            else:  # dobj, advmod, nmod, ...
                # head is word1, the dependent is word2
                first, second = token.head, token

            if first.pos_ == pos1 and second.pos_ == pos2:
                collocations.append(
                    (first.lemma_.lower(), second.lemma_.lower(), dep_rel)
                )

    return collocations

def update_results(results, doc, collocations):
    """Fold one document's counts into the running ``results`` dictionary.

    Adds the document length to ``results["corpus_size"]``, counts every
    token's lower-cased lemma in ``results["unigram"]`` and counts every
    (word1, word2, dep_rel) triple in ``results["bigram"]``. The dictionary is
    modified in place and also returned.
    """
    # Corpus size grows by the number of tokens in this document
    results["corpus_size"] += len(doc)

    # Lemma frequencies
    unigram_counts = results["unigram"]
    for tok in doc:
        lemma = tok.lemma_.lower()
        if lemma in unigram_counts:
            unigram_counts[lemma] += 1
        else:
            unigram_counts[lemma] = 1

    # Collocation frequencies, keyed by "word1_word2_deprel"
    bigram_table = results["bigram"]
    for first, second, relation in collocations:
        entry = bigram_table.setdefault(
            f"{first}_{second}_{relation}",
            {"word1": first, "word2": second, "dep_rel": relation, "freq": 0},
        )
        entry["freq"] += 1

    return results

Now it’s time to run the code to extract collocations

Show code
# Main processing loop
# Running totals accumulated across all corpus files
results = {"corpus_size": 0,
           "unigram": {},
           "bigram": {}}

for file in CORPUS_FILES:  # Process every file matched by the glob pattern
    # 1. Load the corpus file
    text = load_file(file)
    
    # 2. Parse and identify collocations
    doc = nlp(text)
    collocations = extract_collocations(doc)
    
    # 3. Update results
    results = update_results(results, doc, collocations)
    
    print(f"Processed: {file}")

# Summary of the accumulated counts
print(f"\nCorpus size: {results['corpus_size']} tokens")
print(f"Unique words: {len(results['unigram'])}")
print(f"Unique collocations: {len(results['bigram'])}")
Processed: ../../../corpus_data/brown_single/cf_cf08.txt
Processed: ../../../corpus_data/brown_single/ck_ck17.txt
Processed: ../../../corpus_data/brown_single/cf_cf20.txt
Processed: ../../../corpus_data/brown_single/cf_cf34.txt
Processed: ../../../corpus_data/brown_single/ck_ck03.txt
Processed: ../../../corpus_data/brown_single/ca_ca37.txt
Processed: ../../../corpus_data/brown_single/cl_cl14.txt
Processed: ../../../corpus_data/brown_single/ca_ca23.txt
Processed: ../../../corpus_data/brown_single/ch_ch26.txt
Processed: ../../../corpus_data/brown_single/ce_ce11.txt
Processed: ../../../corpus_data/brown_single/ce_ce05.txt
Processed: ../../../corpus_data/brown_single/cb_cb06.txt
Processed: ../../../corpus_data/brown_single/cb_cb12.txt
Processed: ../../../corpus_data/brown_single/cp_cp25.txt
Processed: ../../../corpus_data/brown_single/cg_cg68.txt
Processed: ../../../corpus_data/brown_single/cg_cg40.txt
Processed: ../../../corpus_data/brown_single/cj_cj77.txt
Processed: ../../../corpus_data/brown_single/cj_cj63.txt
Processed: ../../../corpus_data/brown_single/cp_cp19.txt
Processed: ../../../corpus_data/brown_single/cg_cg54.txt
Processed: ../../../corpus_data/brown_single/cp_cp18.txt
Processed: ../../../corpus_data/brown_single/cj_cj62.txt
Processed: ../../../corpus_data/brown_single/cg_cg55.txt
Processed: ../../../corpus_data/brown_single/cg_cg41.txt
Processed: ../../../corpus_data/brown_single/cj_cj76.txt
Processed: ../../../corpus_data/brown_single/cp_cp24.txt
Processed: ../../../corpus_data/brown_single/cg_cg69.txt
Processed: ../../../corpus_data/brown_single/cb_cb13.txt
Processed: ../../../corpus_data/brown_single/cb_cb07.txt
Processed: ../../../corpus_data/brown_single/ce_ce04.txt
Processed: ../../../corpus_data/brown_single/ch_ch27.txt
Processed: ../../../corpus_data/brown_single/ce_ce10.txt
Processed: ../../../corpus_data/brown_single/cl_cl15.txt
Processed: ../../../corpus_data/brown_single/ca_ca22.txt
Processed: ../../../corpus_data/brown_single/ca_ca36.txt
Processed: ../../../corpus_data/brown_single/cl_cl01.txt
Processed: ../../../corpus_data/brown_single/cf_cf35.txt
Processed: ../../../corpus_data/brown_single/ck_ck02.txt
Processed: ../../../corpus_data/brown_single/ck_ck16.txt
Processed: ../../../corpus_data/brown_single/cf_cf21.txt
Processed: ../../../corpus_data/brown_single/cf_cf09.txt
Processed: ../../../corpus_data/brown_single/ck_ck28.txt
Processed: ../../../corpus_data/brown_single/cf_cf37.txt
Processed: ../../../corpus_data/brown_single/cf_cf23.txt
Processed: ../../../corpus_data/brown_single/ck_ck14.txt
Processed: ../../../corpus_data/brown_single/ca_ca20.txt
Processed: ../../../corpus_data/brown_single/cl_cl17.txt
Processed: ../../../corpus_data/brown_single/cl_cl03.txt
Processed: ../../../corpus_data/brown_single/ca_ca34.txt
Processed: ../../../corpus_data/brown_single/ca_ca08.txt
Processed: ../../../corpus_data/brown_single/ce_ce06.txt
Processed: ../../../corpus_data/brown_single/ce_ce12.txt
Processed: ../../../corpus_data/brown_single/ch_ch25.txt
Processed: ../../../corpus_data/brown_single/ch_ch19.txt
Processed: ../../../corpus_data/brown_single/cb_cb11.txt
Processed: ../../../corpus_data/brown_single/cb_cb05.txt
Processed: ../../../corpus_data/brown_single/cp_cp26.txt
Processed: ../../../corpus_data/brown_single/cj_cj48.txt
Processed: ../../../corpus_data/brown_single/cg_cg57.txt
Processed: ../../../corpus_data/brown_single/cj_cj60.txt
Processed: ../../../corpus_data/brown_single/cj_cj74.txt
Processed: ../../../corpus_data/brown_single/cg_cg43.txt
Processed: ../../../corpus_data/brown_single/cj_cj75.txt
Processed: ../../../corpus_data/brown_single/cg_cg42.txt
Processed: ../../../corpus_data/brown_single/cg_cg56.txt
Processed: ../../../corpus_data/brown_single/cj_cj61.txt
Processed: ../../../corpus_data/brown_single/cj_cj49.txt
Processed: ../../../corpus_data/brown_single/cp_cp27.txt
Processed: ../../../corpus_data/brown_single/cb_cb04.txt
Processed: ../../../corpus_data/brown_single/cb_cb10.txt
Processed: ../../../corpus_data/brown_single/ch_ch18.txt
Processed: ../../../corpus_data/brown_single/ce_ce13.txt
Processed: ../../../corpus_data/brown_single/ch_ch24.txt
Processed: ../../../corpus_data/brown_single/ch_ch30.txt
Processed: ../../../corpus_data/brown_single/ce_ce07.txt
Processed: ../../../corpus_data/brown_single/ca_ca09.txt
Processed: ../../../corpus_data/brown_single/cl_cl02.txt
Processed: ../../../corpus_data/brown_single/ca_ca35.txt
Processed: ../../../corpus_data/brown_single/ca_ca21.txt
Processed: ../../../corpus_data/brown_single/cl_cl16.txt
Processed: ../../../corpus_data/brown_single/cf_cf22.txt
Processed: ../../../corpus_data/brown_single/ck_ck15.txt
Processed: ../../../corpus_data/brown_single/ck_ck01.txt
Processed: ../../../corpus_data/brown_single/cf_cf36.txt
Processed: ../../../corpus_data/brown_single/ck_ck29.txt
Processed: ../../../corpus_data/brown_single/cf_cf32.txt
Processed: ../../../corpus_data/brown_single/ck_ck05.txt
Processed: ../../../corpus_data/brown_single/ck_ck11.txt
Processed: ../../../corpus_data/brown_single/cf_cf26.txt
Processed: ../../../corpus_data/brown_single/ca_ca19.txt
Processed: ../../../corpus_data/brown_single/cl_cl12.txt
Processed: ../../../corpus_data/brown_single/ca_ca25.txt
Processed: ../../../corpus_data/brown_single/ca_ca31.txt
Processed: ../../../corpus_data/brown_single/cl_cl06.txt
Processed: ../../../corpus_data/brown_single/ch_ch08.txt
Processed: ../../../corpus_data/brown_single/ce_ce03.txt
Processed: ../../../corpus_data/brown_single/ch_ch20.txt
Processed: ../../../corpus_data/brown_single/ce_ce17.txt
Processed: ../../../corpus_data/brown_single/cb_cb14.txt
Processed: ../../../corpus_data/brown_single/cj_cj65.txt
Processed: ../../../corpus_data/brown_single/cg_cg52.txt
Processed: ../../../corpus_data/brown_single/cg_cg46.txt
Processed: ../../../corpus_data/brown_single/cj_cj71.txt
Processed: ../../../corpus_data/brown_single/cp_cp23.txt
Processed: ../../../corpus_data/brown_single/cj_cj59.txt
Processed: ../../../corpus_data/brown_single/cj_cj58.txt
Processed: ../../../corpus_data/brown_single/cp_cp22.txt
Processed: ../../../corpus_data/brown_single/cg_cg47.txt
Processed: ../../../corpus_data/brown_single/cj_cj70.txt
Processed: ../../../corpus_data/brown_single/cj_cj64.txt
Processed: ../../../corpus_data/brown_single/cg_cg53.txt
Processed: ../../../corpus_data/brown_single/cb_cb01.txt
Processed: ../../../corpus_data/brown_single/cb_cb15.txt
Processed: ../../../corpus_data/brown_single/ch_ch21.txt
Processed: ../../../corpus_data/brown_single/ce_ce16.txt
Processed: ../../../corpus_data/brown_single/ce_ce02.txt
Processed: ../../../corpus_data/brown_single/ch_ch09.txt
Processed: ../../../corpus_data/brown_single/ca_ca30.txt
Processed: ../../../corpus_data/brown_single/cl_cl07.txt
Processed: ../../../corpus_data/brown_single/cl_cl13.txt
Processed: ../../../corpus_data/brown_single/ca_ca24.txt
Processed: ../../../corpus_data/brown_single/ca_ca18.txt
Processed: ../../../corpus_data/brown_single/ck_ck10.txt
Processed: ../../../corpus_data/brown_single/cf_cf27.txt
Processed: ../../../corpus_data/brown_single/cf_cf33.txt
Processed: ../../../corpus_data/brown_single/ck_ck04.txt
Processed: ../../../corpus_data/brown_single/cf_cf25.txt
Processed: ../../../corpus_data/brown_single/ck_ck12.txt
Processed: ../../../corpus_data/brown_single/ck_ck06.txt
Processed: ../../../corpus_data/brown_single/cf_cf31.txt
Processed: ../../../corpus_data/brown_single/cf_cf19.txt
Processed: ../../../corpus_data/brown_single/cl_cl05.txt
Processed: ../../../corpus_data/brown_single/ca_ca32.txt
Processed: ../../../corpus_data/brown_single/ca_ca26.txt
Processed: ../../../corpus_data/brown_single/cl_cl11.txt
Processed: ../../../corpus_data/brown_single/ce_ce28.txt
Processed: ../../../corpus_data/brown_single/ce_ce14.txt
Processed: ../../../corpus_data/brown_single/ch_ch23.txt
Processed: ../../../corpus_data/brown_single/cb_cb03.txt
Processed: ../../../corpus_data/brown_single/cb_cb17.txt
Processed: ../../../corpus_data/brown_single/cp_cp08.txt
Processed: ../../../corpus_data/brown_single/cj_cj72.txt
Processed: ../../../corpus_data/brown_single/cg_cg45.txt
Processed: ../../../corpus_data/brown_single/cg_cg51.txt
Processed: ../../../corpus_data/brown_single/cj_cj66.txt
Processed: ../../../corpus_data/brown_single/cp_cp20.txt
Processed: ../../../corpus_data/brown_single/cp_cp21.txt
Processed: ../../../corpus_data/brown_single/cg_cg50.txt
Processed: ../../../corpus_data/brown_single/cj_cj67.txt
Processed: ../../../corpus_data/brown_single/cj_cj73.txt
Processed: ../../../corpus_data/brown_single/cp_cp09.txt
Processed: ../../../corpus_data/brown_single/cg_cg44.txt
Processed: ../../../corpus_data/brown_single/cb_cb16.txt
Processed: ../../../corpus_data/brown_single/cb_cb02.txt
Processed: ../../../corpus_data/brown_single/ce_ce01.txt
Processed: ../../../corpus_data/brown_single/ce_ce15.txt
Processed: ../../../corpus_data/brown_single/ch_ch22.txt
Processed: ../../../corpus_data/brown_single/ce_ce29.txt
Processed: ../../../corpus_data/brown_single/ca_ca27.txt
Processed: ../../../corpus_data/brown_single/cl_cl10.txt
Processed: ../../../corpus_data/brown_single/cl_cl04.txt
Processed: ../../../corpus_data/brown_single/ca_ca33.txt
Processed: ../../../corpus_data/brown_single/cf_cf18.txt
Processed: ../../../corpus_data/brown_single/ck_ck07.txt
Processed: ../../../corpus_data/brown_single/cf_cf30.txt
Processed: ../../../corpus_data/brown_single/cf_cf24.txt
Processed: ../../../corpus_data/brown_single/ck_ck13.txt
Processed: ../../../corpus_data/brown_single/cc_cc05.txt
Processed: ../../../corpus_data/brown_single/cn_cn26.txt
Processed: ../../../corpus_data/brown_single/cc_cc11.txt
Processed: ../../../corpus_data/brown_single/cf_cf43.txt
Processed: ../../../corpus_data/brown_single/cd_cd12.txt
Processed: ../../../corpus_data/brown_single/cd_cd06.txt
Processed: ../../../corpus_data/brown_single/ca_ca40.txt
Processed: ../../../corpus_data/brown_single/cr_cr03.txt
Processed: ../../../corpus_data/brown_single/cm_cm03.txt
Processed: ../../../corpus_data/brown_single/cj_cj28.txt
Processed: ../../../corpus_data/brown_single/cj_cj14.txt
Processed: ../../../corpus_data/brown_single/cg_cg23.txt
Processed: ../../../corpus_data/brown_single/cg_cg37.txt
Processed: ../../../corpus_data/brown_single/cg_cg36.txt
Processed: ../../../corpus_data/brown_single/cj_cj01.txt
Processed: ../../../corpus_data/brown_single/cj_cj15.txt
Processed: ../../../corpus_data/brown_single/cg_cg22.txt
Processed: ../../../corpus_data/brown_single/cj_cj29.txt
Processed: ../../../corpus_data/brown_single/cm_cm02.txt
Processed: ../../../corpus_data/brown_single/cr_cr02.txt
Processed: ../../../corpus_data/brown_single/ca_ca41.txt
Processed: ../../../corpus_data/brown_single/cd_cd07.txt
Processed: ../../../corpus_data/brown_single/cd_cd13.txt
Processed: ../../../corpus_data/brown_single/cf_cf42.txt
Processed: ../../../corpus_data/brown_single/cn_cn27.txt
Processed: ../../../corpus_data/brown_single/cc_cc10.txt
Processed: ../../../corpus_data/brown_single/cc_cc04.txt
Processed: ../../../corpus_data/brown_single/cn_cn19.txt
Processed: ../../../corpus_data/brown_single/cc_cc12.txt
Processed: ../../../corpus_data/brown_single/cn_cn25.txt
Processed: ../../../corpus_data/brown_single/cc_cc06.txt
Processed: ../../../corpus_data/brown_single/cf_cf40.txt
Processed: ../../../corpus_data/brown_single/cd_cd05.txt
Processed: ../../../corpus_data/brown_single/cd_cd11.txt
Processed: ../../../corpus_data/brown_single/ca_ca43.txt
Processed: ../../../corpus_data/brown_single/cg_cg08.txt
Processed: ../../../corpus_data/brown_single/cj_cj03.txt
Processed: ../../../corpus_data/brown_single/cg_cg34.txt
Processed: ../../../corpus_data/brown_single/cg_cg20.txt
Processed: ../../../corpus_data/brown_single/cj_cj17.txt
Processed: ../../../corpus_data/brown_single/cg_cg21.txt
Processed: ../../../corpus_data/brown_single/cj_cj16.txt
Processed: ../../../corpus_data/brown_single/cj_cj02.txt
Processed: ../../../corpus_data/brown_single/cg_cg35.txt
Processed: ../../../corpus_data/brown_single/cg_cg09.txt
Processed: ../../../corpus_data/brown_single/cm_cm01.txt
Processed: ../../../corpus_data/brown_single/cr_cr01.txt
Processed: ../../../corpus_data/brown_single/ca_ca42.txt
Processed: ../../../corpus_data/brown_single/cd_cd10.txt
Processed: ../../../corpus_data/brown_single/cd_cd04.txt
Processed: ../../../corpus_data/brown_single/cf_cf41.txt
Processed: ../../../corpus_data/brown_single/cc_cc07.txt
Processed: ../../../corpus_data/brown_single/cc_cc13.txt
Processed: ../../../corpus_data/brown_single/cn_cn24.txt
Processed: ../../../corpus_data/brown_single/cn_cn18.txt
Processed: ../../../corpus_data/brown_single/cn_cn20.txt
Processed: ../../../corpus_data/brown_single/cc_cc17.txt
Processed: ../../../corpus_data/brown_single/cc_cc03.txt
Processed: ../../../corpus_data/brown_single/cn_cn08.txt
Processed: ../../../corpus_data/brown_single/cf_cf45.txt
Processed: ../../../corpus_data/brown_single/cd_cd14.txt
Processed: ../../../corpus_data/brown_single/cr_cr05.txt
Processed: ../../../corpus_data/brown_single/cm_cm05.txt
Processed: ../../../corpus_data/brown_single/cg_cg31.txt
Processed: ../../../corpus_data/brown_single/cj_cj06.txt
Processed: ../../../corpus_data/brown_single/cj_cj12.txt
Processed: ../../../corpus_data/brown_single/cg_cg25.txt
Processed: ../../../corpus_data/brown_single/cg_cg19.txt
Processed: ../../../corpus_data/brown_single/cg_cg18.txt
Processed: ../../../corpus_data/brown_single/cj_cj13.txt
Processed: ../../../corpus_data/brown_single/cg_cg24.txt
Processed: ../../../corpus_data/brown_single/cg_cg30.txt
Processed: ../../../corpus_data/brown_single/cj_cj07.txt
Processed: ../../../corpus_data/brown_single/cm_cm04.txt
Processed: ../../../corpus_data/brown_single/cr_cr04.txt
Processed: ../../../corpus_data/brown_single/cd_cd15.txt
Processed: ../../../corpus_data/brown_single/cd_cd01.txt
Processed: ../../../corpus_data/brown_single/cf_cf44.txt
Processed: ../../../corpus_data/brown_single/cn_cn09.txt
Processed: ../../../corpus_data/brown_single/cc_cc02.txt
Processed: ../../../corpus_data/brown_single/cn_cn21.txt
Processed: ../../../corpus_data/brown_single/cc_cc16.txt
Processed: ../../../corpus_data/brown_single/cc_cc14.txt
Processed: ../../../corpus_data/brown_single/cn_cn23.txt
Processed: ../../../corpus_data/brown_single/cf_cf46.txt
Processed: ../../../corpus_data/brown_single/cd_cd17.txt
Processed: ../../../corpus_data/brown_single/cd_cd03.txt
Processed: ../../../corpus_data/brown_single/cr_cr06.txt
Processed: ../../../corpus_data/brown_single/cm_cm06.txt
Processed: ../../../corpus_data/brown_single/cg_cg26.txt
Processed: ../../../corpus_data/brown_single/cj_cj11.txt
Processed: ../../../corpus_data/brown_single/cj_cj05.txt
Processed: ../../../corpus_data/brown_single/cg_cg32.txt
Processed: ../../../corpus_data/brown_single/cj_cj39.txt
Processed: ../../../corpus_data/brown_single/cj_cj38.txt
Processed: ../../../corpus_data/brown_single/cj_cj04.txt
Processed: ../../../corpus_data/brown_single/cg_cg33.txt
Processed: ../../../corpus_data/brown_single/cg_cg27.txt
Processed: ../../../corpus_data/brown_single/cj_cj10.txt
Processed: ../../../corpus_data/brown_single/cr_cr07.txt
Processed: ../../../corpus_data/brown_single/ca_ca44.txt
Processed: ../../../corpus_data/brown_single/cd_cd02.txt
Processed: ../../../corpus_data/brown_single/cd_cd16.txt
Processed: ../../../corpus_data/brown_single/cf_cf47.txt
Processed: ../../../corpus_data/brown_single/cc_cc15.txt
Processed: ../../../corpus_data/brown_single/cn_cn22.txt
Processed: ../../../corpus_data/brown_single/cc_cc01.txt
Processed: ../../../corpus_data/brown_single/cn_cn13.txt
Processed: ../../../corpus_data/brown_single/cn_cn07.txt
Processed: ../../../corpus_data/brown_single/cj_cj09.txt
Processed: ../../../corpus_data/brown_single/cj_cj35.txt
Processed: ../../../corpus_data/brown_single/cg_cg02.txt
Processed: ../../../corpus_data/brown_single/cg_cg16.txt
Processed: ../../../corpus_data/brown_single/cj_cj21.txt
Processed: ../../../corpus_data/brown_single/cg_cg17.txt
Processed: ../../../corpus_data/brown_single/cj_cj20.txt
Processed: ../../../corpus_data/brown_single/cj_cj34.txt
Processed: ../../../corpus_data/brown_single/cg_cg03.txt
Processed: ../../../corpus_data/brown_single/cj_cj08.txt
Processed: ../../../corpus_data/brown_single/cn_cn06.txt
Processed: ../../../corpus_data/brown_single/cn_cn12.txt
Processed: ../../../corpus_data/brown_single/cn_cn04.txt
Processed: ../../../corpus_data/brown_single/cn_cn10.txt
Processed: ../../../corpus_data/brown_single/cr_cr09.txt
Processed: ../../../corpus_data/brown_single/cg_cg29.txt
Processed: ../../../corpus_data/brown_single/cj_cj22.txt
Processed: ../../../corpus_data/brown_single/cg_cg15.txt
Processed: ../../../corpus_data/brown_single/cg_cg01.txt
Processed: ../../../corpus_data/brown_single/cj_cj36.txt
Processed: ../../../corpus_data/brown_single/cj_cj37.txt
Processed: ../../../corpus_data/brown_single/cj_cj23.txt
Processed: ../../../corpus_data/brown_single/cg_cg14.txt
Processed: ../../../corpus_data/brown_single/cg_cg28.txt
Processed: ../../../corpus_data/brown_single/cr_cr08.txt
Processed: ../../../corpus_data/brown_single/cf_cf48.txt
Processed: ../../../corpus_data/brown_single/cn_cn11.txt
Processed: ../../../corpus_data/brown_single/cn_cn05.txt
Processed: ../../../corpus_data/brown_single/cn_cn01.txt
Processed: ../../../corpus_data/brown_single/cn_cn15.txt
Processed: ../../../corpus_data/brown_single/cn_cn29.txt
Processed: ../../../corpus_data/brown_single/cd_cd09.txt
Processed: ../../../corpus_data/brown_single/cg_cg10.txt
Processed: ../../../corpus_data/brown_single/cj_cj27.txt
Processed: ../../../corpus_data/brown_single/cj_cj33.txt
Processed: ../../../corpus_data/brown_single/cg_cg04.txt
Processed: ../../../corpus_data/brown_single/cg_cg38.txt
Processed: ../../../corpus_data/brown_single/cg_cg39.txt
Processed: ../../../corpus_data/brown_single/cj_cj32.txt
Processed: ../../../corpus_data/brown_single/cg_cg05.txt
Processed: ../../../corpus_data/brown_single/cg_cg11.txt
Processed: ../../../corpus_data/brown_single/cj_cj26.txt
Processed: ../../../corpus_data/brown_single/cd_cd08.txt
Processed: ../../../corpus_data/brown_single/cn_cn28.txt
Processed: ../../../corpus_data/brown_single/cn_cn14.txt
Processed: ../../../corpus_data/brown_single/cn_cn16.txt
Processed: ../../../corpus_data/brown_single/cn_cn02.txt
Processed: ../../../corpus_data/brown_single/cc_cc09.txt
Processed: ../../../corpus_data/brown_single/cg_cg07.txt
Processed: ../../../corpus_data/brown_single/cj_cj30.txt
Processed: ../../../corpus_data/brown_single/cj_cj24.txt
Processed: ../../../corpus_data/brown_single/cg_cg13.txt
Processed: ../../../corpus_data/brown_single/cj_cj18.txt
Processed: ../../../corpus_data/brown_single/cj_cj19.txt
Processed: ../../../corpus_data/brown_single/cj_cj25.txt
Processed: ../../../corpus_data/brown_single/cg_cg12.txt
Processed: ../../../corpus_data/brown_single/cg_cg06.txt
Processed: ../../../corpus_data/brown_single/cj_cj31.txt
Processed: ../../../corpus_data/brown_single/cc_cc08.txt
Processed: ../../../corpus_data/brown_single/cn_cn03.txt
Processed: ../../../corpus_data/brown_single/cn_cn17.txt
Processed: ../../../corpus_data/brown_single/cf_cf29.txt
Processed: ../../../corpus_data/brown_single/cf_cf01.txt
Processed: ../../../corpus_data/brown_single/cf_cf15.txt
Processed: ../../../corpus_data/brown_single/ck_ck22.txt
Processed: ../../../corpus_data/brown_single/ca_ca16.txt
Processed: ../../../corpus_data/brown_single/cl_cl21.txt
Processed: ../../../corpus_data/brown_single/ca_ca02.txt
Processed: ../../../corpus_data/brown_single/cl_cl09.txt
Processed: ../../../corpus_data/brown_single/ch_ch07.txt
Processed: ../../../corpus_data/brown_single/ce_ce30.txt
Processed: ../../../corpus_data/brown_single/ce_ce24.txt
Processed: ../../../corpus_data/brown_single/ch_ch13.txt
Processed: ../../../corpus_data/brown_single/ce_ce18.txt
Processed: ../../../corpus_data/brown_single/cb_cb27.txt
Processed: ../../../corpus_data/brown_single/cp_cp10.txt
Processed: ../../../corpus_data/brown_single/cp_cp04.txt
Processed: ../../../corpus_data/brown_single/cg_cg49.txt
Processed: ../../../corpus_data/brown_single/cg_cg61.txt
Processed: ../../../corpus_data/brown_single/cj_cj56.txt
Processed: ../../../corpus_data/brown_single/cj_cj42.txt
Processed: ../../../corpus_data/brown_single/cg_cg75.txt
Processed: ../../../corpus_data/brown_single/cj_cj43.txt
Processed: ../../../corpus_data/brown_single/cg_cg74.txt
Processed: ../../../corpus_data/brown_single/cg_cg60.txt
Processed: ../../../corpus_data/brown_single/cj_cj57.txt
Processed: ../../../corpus_data/brown_single/cp_cp05.txt
Processed: ../../../corpus_data/brown_single/cg_cg48.txt
Processed: ../../../corpus_data/brown_single/cp_cp11.txt
Processed: ../../../corpus_data/brown_single/cj_cj80.txt
Processed: ../../../corpus_data/brown_single/cb_cb26.txt
Processed: ../../../corpus_data/brown_single/ce_ce19.txt
Processed: ../../../corpus_data/brown_single/ce_ce25.txt
Processed: ../../../corpus_data/brown_single/ch_ch12.txt
Processed: ../../../corpus_data/brown_single/ch_ch06.txt
Processed: ../../../corpus_data/brown_single/ce_ce31.txt
Processed: ../../../corpus_data/brown_single/cl_cl08.txt
Processed: ../../../corpus_data/brown_single/ca_ca03.txt
Processed: ../../../corpus_data/brown_single/ca_ca17.txt
Processed: ../../../corpus_data/brown_single/cl_cl20.txt
Processed: ../../../corpus_data/brown_single/cf_cf14.txt
Processed: ../../../corpus_data/brown_single/ck_ck23.txt
Processed: ../../../corpus_data/brown_single/cf_cf28.txt
Processed: ../../../corpus_data/brown_single/ck_ck09.txt
Processed: ../../../corpus_data/brown_single/ck_ck21.txt
Processed: ../../../corpus_data/brown_single/cf_cf16.txt
Processed: ../../../corpus_data/brown_single/cf_cf02.txt
Processed: ../../../corpus_data/brown_single/ca_ca01.txt
Processed: ../../../corpus_data/brown_single/cl_cl22.txt
Processed: ../../../corpus_data/brown_single/ca_ca15.txt
Processed: ../../../corpus_data/brown_single/ca_ca29.txt
Processed: ../../../corpus_data/brown_single/ch_ch10.txt
Processed: ../../../corpus_data/brown_single/ce_ce27.txt
Processed: ../../../corpus_data/brown_single/ce_ce33.txt
Processed: ../../../corpus_data/brown_single/ch_ch04.txt
Processed: ../../../corpus_data/brown_single/cb_cb18.txt
Processed: ../../../corpus_data/brown_single/cb_cb24.txt
Processed: ../../../corpus_data/brown_single/cp_cp07.txt
Processed: ../../../corpus_data/brown_single/cp_cp13.txt
Processed: ../../../corpus_data/brown_single/cj_cj69.txt
Processed: ../../../corpus_data/brown_single/cj_cj41.txt
Processed: ../../../corpus_data/brown_single/cj_cj55.txt
Processed: ../../../corpus_data/brown_single/cg_cg62.txt
Processed: ../../../corpus_data/brown_single/cj_cj54.txt
Processed: ../../../corpus_data/brown_single/cg_cg63.txt
Processed: ../../../corpus_data/brown_single/cj_cj40.txt
Processed: ../../../corpus_data/brown_single/cj_cj68.txt
Processed: ../../../corpus_data/brown_single/cp_cp12.txt
Processed: ../../../corpus_data/brown_single/cp_cp06.txt
Processed: ../../../corpus_data/brown_single/cb_cb25.txt
Processed: ../../../corpus_data/brown_single/cb_cb19.txt
Processed: ../../../corpus_data/brown_single/ce_ce32.txt
Processed: ../../../corpus_data/brown_single/ch_ch05.txt
Processed: ../../../corpus_data/brown_single/ch_ch11.txt
Processed: ../../../corpus_data/brown_single/ce_ce26.txt
Processed: ../../../corpus_data/brown_single/ca_ca28.txt
Processed: ../../../corpus_data/brown_single/cl_cl23.txt
Processed: ../../../corpus_data/brown_single/ca_ca14.txt
Processed: ../../../corpus_data/brown_single/cf_cf03.txt
Processed: ../../../corpus_data/brown_single/ck_ck20.txt
Processed: ../../../corpus_data/brown_single/cf_cf17.txt
Processed: ../../../corpus_data/brown_single/ck_ck08.txt
Processed: ../../../corpus_data/brown_single/cf_cf13.txt
Processed: ../../../corpus_data/brown_single/ck_ck24.txt
Processed: ../../../corpus_data/brown_single/cf_cf07.txt
Processed: ../../../corpus_data/brown_single/ck_ck18.txt
Processed: ../../../corpus_data/brown_single/ca_ca38.txt
Processed: ../../../corpus_data/brown_single/ca_ca04.txt
Processed: ../../../corpus_data/brown_single/ca_ca10.txt
Processed: ../../../corpus_data/brown_single/ch_ch29.txt
Processed: ../../../corpus_data/brown_single/ce_ce22.txt
Processed: ../../../corpus_data/brown_single/ch_ch15.txt
Processed: ../../../corpus_data/brown_single/ch_ch01.txt
Processed: ../../../corpus_data/brown_single/ce_ce36.txt
Processed: ../../../corpus_data/brown_single/cb_cb21.txt
Processed: ../../../corpus_data/brown_single/cb_cb09.txt
Processed: ../../../corpus_data/brown_single/cj_cj44.txt
Processed: ../../../corpus_data/brown_single/cg_cg73.txt
Processed: ../../../corpus_data/brown_single/cg_cg67.txt
Processed: ../../../corpus_data/brown_single/cj_cj50.txt
Processed: ../../../corpus_data/brown_single/cj_cj78.txt
Processed: ../../../corpus_data/brown_single/cp_cp02.txt
Processed: ../../../corpus_data/brown_single/cp_cp16.txt
Processed: ../../../corpus_data/brown_single/cp_cp17.txt
Processed: ../../../corpus_data/brown_single/cp_cp03.txt
Processed: ../../../corpus_data/brown_single/cj_cj79.txt
Processed: ../../../corpus_data/brown_single/cg_cg66.txt
Processed: ../../../corpus_data/brown_single/cj_cj51.txt
Processed: ../../../corpus_data/brown_single/cj_cj45.txt
Processed: ../../../corpus_data/brown_single/cg_cg72.txt
Processed: ../../../corpus_data/brown_single/cb_cb08.txt
Processed: ../../../corpus_data/brown_single/cb_cb20.txt
Processed: ../../../corpus_data/brown_single/ce_ce23.txt
Processed: ../../../corpus_data/brown_single/ch_ch14.txt
Processed: ../../../corpus_data/brown_single/ch_ch28.txt
Processed: ../../../corpus_data/brown_single/ca_ca11.txt
Processed: ../../../corpus_data/brown_single/ca_ca05.txt
Processed: ../../../corpus_data/brown_single/ca_ca39.txt
Processed: ../../../corpus_data/brown_single/ck_ck19.txt
Processed: ../../../corpus_data/brown_single/cf_cf06.txt
Processed: ../../../corpus_data/brown_single/cf_cf12.txt
Processed: ../../../corpus_data/brown_single/ck_ck25.txt
Processed: ../../../corpus_data/brown_single/cf_cf04.txt
Processed: ../../../corpus_data/brown_single/ck_ck27.txt
Processed: ../../../corpus_data/brown_single/cf_cf10.txt
Processed: ../../../corpus_data/brown_single/cf_cf38.txt
Processed: ../../../corpus_data/brown_single/cl_cl18.txt
Processed: ../../../corpus_data/brown_single/cl_cl24.txt
Processed: ../../../corpus_data/brown_single/ca_ca13.txt
Processed: ../../../corpus_data/brown_single/ca_ca07.txt
Processed: ../../../corpus_data/brown_single/ce_ce09.txt
Processed: ../../../corpus_data/brown_single/ce_ce35.txt
Processed: ../../../corpus_data/brown_single/ch_ch02.txt
Processed: ../../../corpus_data/brown_single/ch_ch16.txt
Processed: ../../../corpus_data/brown_single/ce_ce21.txt
Processed: ../../../corpus_data/brown_single/cb_cb22.txt
Processed: ../../../corpus_data/brown_single/cj_cj53.txt
Processed: ../../../corpus_data/brown_single/cp_cp29.txt
Processed: ../../../corpus_data/brown_single/cg_cg64.txt
Processed: ../../../corpus_data/brown_single/cg_cg70.txt
Processed: ../../../corpus_data/brown_single/cj_cj47.txt
Processed: ../../../corpus_data/brown_single/cp_cp15.txt
Processed: ../../../corpus_data/brown_single/cg_cg58.txt
Processed: ../../../corpus_data/brown_single/cp_cp01.txt
Processed: ../../../corpus_data/brown_single/cp_cp14.txt
Processed: ../../../corpus_data/brown_single/cg_cg59.txt
Processed: ../../../corpus_data/brown_single/cg_cg71.txt
Processed: ../../../corpus_data/brown_single/cj_cj46.txt
Processed: ../../../corpus_data/brown_single/cp_cp28.txt
Processed: ../../../corpus_data/brown_single/cj_cj52.txt
Processed: ../../../corpus_data/brown_single/cg_cg65.txt
Processed: ../../../corpus_data/brown_single/cb_cb23.txt
Processed: ../../../corpus_data/brown_single/ch_ch17.txt
Processed: ../../../corpus_data/brown_single/ce_ce20.txt
Processed: ../../../corpus_data/brown_single/ce_ce34.txt
Processed: ../../../corpus_data/brown_single/ch_ch03.txt
Processed: ../../../corpus_data/brown_single/ce_ce08.txt
Processed: ../../../corpus_data/brown_single/ca_ca06.txt
Processed: ../../../corpus_data/brown_single/ca_ca12.txt
Processed: ../../../corpus_data/brown_single/cl_cl19.txt
Processed: ../../../corpus_data/brown_single/cf_cf39.txt
Processed: ../../../corpus_data/brown_single/ck_ck26.txt
Processed: ../../../corpus_data/brown_single/cf_cf11.txt
Processed: ../../../corpus_data/brown_single/cf_cf05.txt

Corpus size: 1242331 tokens
Unique words: 34947
Unique collocations: 81932

Transform the data into dataset format

The code above extracts the collocations, but the resulting nested dictionary is not easy for a human to read. Below, I convert it into a tabular format (a pandas DataFrame).

Show code
def results_to_dataframe(results, min_freq=1):
    """
    Convert results dictionary to pandas DataFrame, one row per collocation

    Parameters:
    - results: dictionary with "corpus_size", "unigram" (word -> frequency),
      and "bigram" (key -> dict with 'word1', 'word2', 'dep_rel', 'freq') data
    - min_freq: minimum collocation frequency to include (default: 1)

    Returns:
    - DataFrame sorted by collocation_frequency (descending) with a fresh
      0-based index. All columns are present even when no collocation passes
      the frequency filter, so downstream column access keeps working.
    """
    # Fixing the column list up front keeps the schema stable for empty results
    columns = [
        "collocation",
        "word1",
        "word2",
        "dep_relation",
        "corpus_size",
        "collocation_frequency",
        "word1_freq",
        "word2_freq",
    ]

    unigram = results["unigram"]
    corpus_size = results["corpus_size"]

    rows = [
        {
            "collocation": f"{info['word1']}_{info['word2']}",
            "word1": info['word1'],
            "word2": info['word2'],
            "dep_relation": info['dep_rel'],
            "corpus_size": corpus_size,
            "collocation_frequency": info['freq'],
            # Words missing from the unigram table are reported with frequency 0
            "word1_freq": unigram.get(info['word1'], 0),
            "word2_freq": unigram.get(info['word2'], 0),
        }
        for info in results["bigram"].values()
        if info['freq'] >= min_freq  # drop collocations below the minimum frequency
    ]

    # Create DataFrame and sort by collocation frequency
    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values('collocation_frequency', ascending=False)
    return df.reset_index(drop=True)
Show code



# Build the table, keeping only collocations that occur at least 5 times
df_collocations = results_to_dataframe(results, min_freq=5)

# Preview the 20 most frequent collocations
print("Top 20 collocations:")
top_rows = df_collocations.head(20)
print(top_rows)

# Report how many collocations were kept and which columns the table has
n_collocations = df_collocations.shape[0]
column_names = df_collocations.columns.tolist()
print(f"\nTotal collocations found: {n_collocations}")
print(f"\nDataFrame columns: {column_names}")
Top 20 collocations:
           collocation       word1  ... word1_freq word2_freq
0            last_year        last  ...        705       1789
1            same_time        same  ...        686       1970
2            young_man       young  ...        442       2149
3           take_place        take  ...       1585        836
4          fiscal_year      fiscal  ...        120       1789
5          high_school        high  ...        801        711
6              old_man         old  ...        857       2149
7           first_time       first  ...       1388       1970
8           last_night        last  ...        705        468
9           other_hand       other  ...       2032        801
10            per_cent         per  ...        380        194
11           last_week        last  ...        705        450
12        middle_class      middle  ...        178        352
13  nineteenth_century  nineteenth  ...         58        307
14           have_time        have  ...      11908       1970
15           long_time        long  ...       1095       1970
16            few_year         few  ...        631       1789
17          index_word       index  ...         90        558
18          long_range        long  ...       1095        264
19          great_deal       great  ...        940        223

[20 rows x 8 columns]

Total collocations found: 1844

DataFrame columns: ['collocation', 'word1', 'word2', 'dep_relation', 'corpus_size', 'collocation_frequency', 'word1_freq', 'word2_freq']

Saving the data to a TSV file

Show code
# The index is the row number, which starts at zero by default.
# A 1-based rank is easier for human readers.
# Assign it explicitly instead of using `index += 1`, so that re-running this
# cell does not shift the ranks a second time.
df_collocations.index = range(1, len(df_collocations) + 1)
Show code
# Write the full table as a tab-separated file; the index is saved as the "rank" column
df_collocations.to_csv(
    "../../../corpus_data/brown_collocations.tsv",
    sep="\t",
    index=True,
    index_label="rank",
)

Saving randomly sampled data

Because the dataset is still large, let’s filter it down to a manageable size.

First, we filter by frequency. Let’s retain collocations that occur more than 10 times.

Second, let’s randomly sample 50 collocations from this pool.

Show code
# Show the collocations that occur more than 10 times
df_collocations[df_collocations['collocation_frequency'] > 10]
collocation word1 word2 dep_relation corpus_size collocation_frequency word1_freq word2_freq
1 last_year last year amod 1242331 123 705 1789
2 same_time same time amod 1242331 94 686 1970
3 young_man young man amod 1242331 89 442 2149
4 take_place take place dobj 1242331 84 1585 836
5 fiscal_year fiscal year amod 1242331 76 120 1789
... ... ... ... ... ... ... ... ...
375 mystery_story mystery story compound 1242331 11 47 234
376 get_job get job dobj 1242331 11 1460 308
377 high_cost high cost amod 1242331 11 801 431
378 onset_age onset age compound 1242331 11 43 284
379 far_end far end amod 1242331 11 581 583

379 rows × 8 columns

Show code
# Draw a reproducible random sample (fixed seed) of 50 collocations that occur
# more than 10 times and save it; the sampled rows keep their existing index,
# which is written out as the "rank" column.
(
    df_collocations
    .query('collocation_frequency > 10')
    .sample(n=50, random_state=42)
    .to_csv(
        "../../../corpus_data/brown_collocations_random_50.tsv",
        sep="\t",
        index=True,
        index_label="rank",
    )
)