Show code
# Load packages
import spacy
import glob
import pandas as pd
# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")
Masaki EGUCHI, Ph.D.
July 29, 2025
In this notebook, I will demonstrate how to extract collocations from a corpus.
# Define dependency relations for collocation extraction
# Each entry is a (dependency label, pos1, pos2) tuple. extract_collocations()
# decides, per dependency label, which of pos1/pos2 is compared with the
# dependent token's POS and which with its head's POS.
# Common patterns: adj-noun, verb-object, verb-adverb, noun-prep-noun
COLLOCATION_PATTERNS = [
    ("amod", "ADJ", "NOUN"),  # adjective modifier (e.g., "big house")
    ("dobj", "VERB", "NOUN"),  # direct object (e.g., "eat food")
    ("advmod", "VERB", "ADV"),  # adverb modifier (e.g., "run quickly")
    # NOTE(review): spaCy's English pipelines usually attach "of tea" through
    # prep/pobj rather than nmod, so this pattern may not capture "cup of tea"
    # -- confirm against real parses.
    ("nmod", "NOUN", "NOUN"),  # noun modifier (e.g., "cup of tea")
    ("compound", "NOUN", "NOUN"),  # compound nouns (e.g., "computer science")
]
def load_file(filepath):
    """Return the entire contents of *filepath*, decoded as UTF-8 text."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
def extract_collocations(doc, patterns=None):
    """
    Extract collocations based on dependency patterns

    Parameters:
    - doc: parsed document; an iterable of tokens exposing dep_, pos_, lemma_
      and head (e.g. a spaCy Doc)
    - patterns: iterable of (dep_rel, pos1, pos2) tuples; when omitted,
      COLLOCATION_PATTERNS is looked up at call time

    Returns:
    - list of (word1, word2, dep_rel) tuples of lower-cased lemmas, one per
      match in token order (duplicates are kept so they can be counted)

    Pattern orientation:
    - "amod", "advmod", "compound": pos1 is the dependent's POS and pos2 the
      head's POS; the result is (dependent, head)
    - every relation except "amod" and "compound": pos1 is the head's POS and
      pos2 the dependent's POS; the result is (head, dependent)
    "advmod" is accepted in both orientations. The dependent-first check alone
    can never match a pattern written as ("advmod", "VERB", "ADV"), because
    the token carrying advmod is the adverb and its head is the verb.
    """
    # Resolve the default lazily so that re-running the cell that defines
    # COLLOCATION_PATTERNS takes effect without redefining this function.
    if patterns is None:
        patterns = COLLOCATION_PATTERNS

    dependent_first = ("amod", "advmod", "compound")
    never_head_first = ("amod", "compound")

    collocations = []
    for token in doc:
        for dep_rel, pos1, pos2 in patterns:
            # Check if token matches the dependency pattern
            if token.dep_ != dep_rel:
                continue
            head = token.head
            if (dep_rel in dependent_first
                    and token.pos_ == pos1 and head.pos_ == pos2):
                # pattern written as (dependent POS, head POS)
                collocations.append(
                    (token.lemma_.lower(), head.lemma_.lower(), dep_rel))
            elif (dep_rel not in never_head_first
                    and head.pos_ == pos1 and token.pos_ == pos2):
                # pattern written as (head POS, dependent POS)
                collocations.append(
                    (head.lemma_.lower(), token.lemma_.lower(), dep_rel))
    return collocations
def update_results(results, doc, collocations):
    """
    Fold one document's counts into the running results dictionary

    Parameters:
    - results: dictionary with "corpus_size", "unigram" and "bigram" entries;
      it is modified in place
    - doc: sized iterable of tokens exposing lemma_
    - collocations: iterable of (word1, word2, dep_rel) tuples

    Returns:
    - the same results dictionary that was passed in
    """
    # Corpus size grows by the number of tokens in this document
    results["corpus_size"] += len(doc)

    # Unigram frequencies are keyed by lower-cased lemma
    unigram_counts = results["unigram"]
    for tok in doc:
        lemma = tok.lemma_.lower()
        unigram_counts[lemma] = unigram_counts.get(lemma, 0) + 1

    # Bigram frequencies: create the record on first sight, then count
    bigram_table = results["bigram"]
    for first, second, relation in collocations:
        record = bigram_table.setdefault(
            f"{first}_{second}_{relation}",
            {"word1": first, "word2": second, "dep_rel": relation, "freq": 0},
        )
        record["freq"] += 1

    return results
# Main processing loop
# Running totals shared by all files; update_results() modifies this
# dictionary in place and returns it.
results = {"corpus_size": 0,
           "unigram": {},
           "bigram": {}}
# NOTE(review): CORPUS_FILES is not defined in the visible code -- presumably
# a list of file paths built with glob; confirm.
for file in CORPUS_FILES:  # Iterates over every path in CORPUS_FILES (no limit is applied here)
    # 1. Load the corpus file
    text = load_file(file)
    # 2. Parse and identify collocations
    doc = nlp(text)
    collocations = extract_collocations(doc)
    # 3. Update results
    results = update_results(results, doc, collocations)
    print(f"Processed: {file}")
# Summary of the accumulated counts, printed once after all files are done
print(f"\nCorpus size: {results['corpus_size']} tokens")
print(f"Unique words: {len(results['unigram'])}")
print(f"Unique collocations: {len(results['bigram'])}")
Processed: ../../../corpus_data/brown_single/cf_cf08.txt
Processed: ../../../corpus_data/brown_single/ck_ck17.txt
Processed: ../../../corpus_data/brown_single/cf_cf20.txt
Processed: ../../../corpus_data/brown_single/cf_cf34.txt
Processed: ../../../corpus_data/brown_single/ck_ck03.txt
Processed: ../../../corpus_data/brown_single/ca_ca37.txt
Processed: ../../../corpus_data/brown_single/cl_cl14.txt
Processed: ../../../corpus_data/brown_single/ca_ca23.txt
Processed: ../../../corpus_data/brown_single/ch_ch26.txt
Processed: ../../../corpus_data/brown_single/ce_ce11.txt
Processed: ../../../corpus_data/brown_single/ce_ce05.txt
Processed: ../../../corpus_data/brown_single/cb_cb06.txt
Processed: ../../../corpus_data/brown_single/cb_cb12.txt
Processed: ../../../corpus_data/brown_single/cp_cp25.txt
Processed: ../../../corpus_data/brown_single/cg_cg68.txt
Processed: ../../../corpus_data/brown_single/cg_cg40.txt
Processed: ../../../corpus_data/brown_single/cj_cj77.txt
Processed: ../../../corpus_data/brown_single/cj_cj63.txt
Processed: ../../../corpus_data/brown_single/cp_cp19.txt
Processed: ../../../corpus_data/brown_single/cg_cg54.txt
Processed: ../../../corpus_data/brown_single/cp_cp18.txt
Processed: ../../../corpus_data/brown_single/cj_cj62.txt
Processed: ../../../corpus_data/brown_single/cg_cg55.txt
Processed: ../../../corpus_data/brown_single/cg_cg41.txt
Processed: ../../../corpus_data/brown_single/cj_cj76.txt
Processed: ../../../corpus_data/brown_single/cp_cp24.txt
Processed: ../../../corpus_data/brown_single/cg_cg69.txt
Processed: ../../../corpus_data/brown_single/cb_cb13.txt
Processed: ../../../corpus_data/brown_single/cb_cb07.txt
Processed: ../../../corpus_data/brown_single/ce_ce04.txt
Processed: ../../../corpus_data/brown_single/ch_ch27.txt
Processed: ../../../corpus_data/brown_single/ce_ce10.txt
Processed: ../../../corpus_data/brown_single/cl_cl15.txt
Processed: ../../../corpus_data/brown_single/ca_ca22.txt
Processed: ../../../corpus_data/brown_single/ca_ca36.txt
Processed: ../../../corpus_data/brown_single/cl_cl01.txt
Processed: ../../../corpus_data/brown_single/cf_cf35.txt
Processed: ../../../corpus_data/brown_single/ck_ck02.txt
Processed: ../../../corpus_data/brown_single/ck_ck16.txt
Processed: ../../../corpus_data/brown_single/cf_cf21.txt
Processed: ../../../corpus_data/brown_single/cf_cf09.txt
Processed: ../../../corpus_data/brown_single/ck_ck28.txt
Processed: ../../../corpus_data/brown_single/cf_cf37.txt
Processed: ../../../corpus_data/brown_single/cf_cf23.txt
Processed: ../../../corpus_data/brown_single/ck_ck14.txt
Processed: ../../../corpus_data/brown_single/ca_ca20.txt
Processed: ../../../corpus_data/brown_single/cl_cl17.txt
Processed: ../../../corpus_data/brown_single/cl_cl03.txt
Processed: ../../../corpus_data/brown_single/ca_ca34.txt
Processed: ../../../corpus_data/brown_single/ca_ca08.txt
Processed: ../../../corpus_data/brown_single/ce_ce06.txt
Processed: ../../../corpus_data/brown_single/ce_ce12.txt
Processed: ../../../corpus_data/brown_single/ch_ch25.txt
Processed: ../../../corpus_data/brown_single/ch_ch19.txt
Processed: ../../../corpus_data/brown_single/cb_cb11.txt
Processed: ../../../corpus_data/brown_single/cb_cb05.txt
Processed: ../../../corpus_data/brown_single/cp_cp26.txt
Processed: ../../../corpus_data/brown_single/cj_cj48.txt
Processed: ../../../corpus_data/brown_single/cg_cg57.txt
Processed: ../../../corpus_data/brown_single/cj_cj60.txt
Processed: ../../../corpus_data/brown_single/cj_cj74.txt
Processed: ../../../corpus_data/brown_single/cg_cg43.txt
Processed: ../../../corpus_data/brown_single/cj_cj75.txt
Processed: ../../../corpus_data/brown_single/cg_cg42.txt
Processed: ../../../corpus_data/brown_single/cg_cg56.txt
Processed: ../../../corpus_data/brown_single/cj_cj61.txt
Processed: ../../../corpus_data/brown_single/cj_cj49.txt
Processed: ../../../corpus_data/brown_single/cp_cp27.txt
Processed: ../../../corpus_data/brown_single/cb_cb04.txt
Processed: ../../../corpus_data/brown_single/cb_cb10.txt
Processed: ../../../corpus_data/brown_single/ch_ch18.txt
Processed: ../../../corpus_data/brown_single/ce_ce13.txt
Processed: ../../../corpus_data/brown_single/ch_ch24.txt
Processed: ../../../corpus_data/brown_single/ch_ch30.txt
Processed: ../../../corpus_data/brown_single/ce_ce07.txt
Processed: ../../../corpus_data/brown_single/ca_ca09.txt
Processed: ../../../corpus_data/brown_single/cl_cl02.txt
Processed: ../../../corpus_data/brown_single/ca_ca35.txt
Processed: ../../../corpus_data/brown_single/ca_ca21.txt
Processed: ../../../corpus_data/brown_single/cl_cl16.txt
Processed: ../../../corpus_data/brown_single/cf_cf22.txt
Processed: ../../../corpus_data/brown_single/ck_ck15.txt
Processed: ../../../corpus_data/brown_single/ck_ck01.txt
Processed: ../../../corpus_data/brown_single/cf_cf36.txt
Processed: ../../../corpus_data/brown_single/ck_ck29.txt
Processed: ../../../corpus_data/brown_single/cf_cf32.txt
Processed: ../../../corpus_data/brown_single/ck_ck05.txt
Processed: ../../../corpus_data/brown_single/ck_ck11.txt
Processed: ../../../corpus_data/brown_single/cf_cf26.txt
Processed: ../../../corpus_data/brown_single/ca_ca19.txt
Processed: ../../../corpus_data/brown_single/cl_cl12.txt
Processed: ../../../corpus_data/brown_single/ca_ca25.txt
Processed: ../../../corpus_data/brown_single/ca_ca31.txt
Processed: ../../../corpus_data/brown_single/cl_cl06.txt
Processed: ../../../corpus_data/brown_single/ch_ch08.txt
Processed: ../../../corpus_data/brown_single/ce_ce03.txt
Processed: ../../../corpus_data/brown_single/ch_ch20.txt
Processed: ../../../corpus_data/brown_single/ce_ce17.txt
Processed: ../../../corpus_data/brown_single/cb_cb14.txt
Processed: ../../../corpus_data/brown_single/cj_cj65.txt
Processed: ../../../corpus_data/brown_single/cg_cg52.txt
Processed: ../../../corpus_data/brown_single/cg_cg46.txt
Processed: ../../../corpus_data/brown_single/cj_cj71.txt
Processed: ../../../corpus_data/brown_single/cp_cp23.txt
Processed: ../../../corpus_data/brown_single/cj_cj59.txt
Processed: ../../../corpus_data/brown_single/cj_cj58.txt
Processed: ../../../corpus_data/brown_single/cp_cp22.txt
Processed: ../../../corpus_data/brown_single/cg_cg47.txt
Processed: ../../../corpus_data/brown_single/cj_cj70.txt
Processed: ../../../corpus_data/brown_single/cj_cj64.txt
Processed: ../../../corpus_data/brown_single/cg_cg53.txt
Processed: ../../../corpus_data/brown_single/cb_cb01.txt
Processed: ../../../corpus_data/brown_single/cb_cb15.txt
Processed: ../../../corpus_data/brown_single/ch_ch21.txt
Processed: ../../../corpus_data/brown_single/ce_ce16.txt
Processed: ../../../corpus_data/brown_single/ce_ce02.txt
Processed: ../../../corpus_data/brown_single/ch_ch09.txt
Processed: ../../../corpus_data/brown_single/ca_ca30.txt
Processed: ../../../corpus_data/brown_single/cl_cl07.txt
Processed: ../../../corpus_data/brown_single/cl_cl13.txt
Processed: ../../../corpus_data/brown_single/ca_ca24.txt
Processed: ../../../corpus_data/brown_single/ca_ca18.txt
Processed: ../../../corpus_data/brown_single/ck_ck10.txt
Processed: ../../../corpus_data/brown_single/cf_cf27.txt
Processed: ../../../corpus_data/brown_single/cf_cf33.txt
Processed: ../../../corpus_data/brown_single/ck_ck04.txt
Processed: ../../../corpus_data/brown_single/cf_cf25.txt
Processed: ../../../corpus_data/brown_single/ck_ck12.txt
Processed: ../../../corpus_data/brown_single/ck_ck06.txt
Processed: ../../../corpus_data/brown_single/cf_cf31.txt
Processed: ../../../corpus_data/brown_single/cf_cf19.txt
Processed: ../../../corpus_data/brown_single/cl_cl05.txt
Processed: ../../../corpus_data/brown_single/ca_ca32.txt
Processed: ../../../corpus_data/brown_single/ca_ca26.txt
Processed: ../../../corpus_data/brown_single/cl_cl11.txt
Processed: ../../../corpus_data/brown_single/ce_ce28.txt
Processed: ../../../corpus_data/brown_single/ce_ce14.txt
Processed: ../../../corpus_data/brown_single/ch_ch23.txt
Processed: ../../../corpus_data/brown_single/cb_cb03.txt
Processed: ../../../corpus_data/brown_single/cb_cb17.txt
Processed: ../../../corpus_data/brown_single/cp_cp08.txt
Processed: ../../../corpus_data/brown_single/cj_cj72.txt
Processed: ../../../corpus_data/brown_single/cg_cg45.txt
Processed: ../../../corpus_data/brown_single/cg_cg51.txt
Processed: ../../../corpus_data/brown_single/cj_cj66.txt
Processed: ../../../corpus_data/brown_single/cp_cp20.txt
Processed: ../../../corpus_data/brown_single/cp_cp21.txt
Processed: ../../../corpus_data/brown_single/cg_cg50.txt
Processed: ../../../corpus_data/brown_single/cj_cj67.txt
Processed: ../../../corpus_data/brown_single/cj_cj73.txt
Processed: ../../../corpus_data/brown_single/cp_cp09.txt
Processed: ../../../corpus_data/brown_single/cg_cg44.txt
Processed: ../../../corpus_data/brown_single/cb_cb16.txt
Processed: ../../../corpus_data/brown_single/cb_cb02.txt
Processed: ../../../corpus_data/brown_single/ce_ce01.txt
Processed: ../../../corpus_data/brown_single/ce_ce15.txt
Processed: ../../../corpus_data/brown_single/ch_ch22.txt
Processed: ../../../corpus_data/brown_single/ce_ce29.txt
Processed: ../../../corpus_data/brown_single/ca_ca27.txt
Processed: ../../../corpus_data/brown_single/cl_cl10.txt
Processed: ../../../corpus_data/brown_single/cl_cl04.txt
Processed: ../../../corpus_data/brown_single/ca_ca33.txt
Processed: ../../../corpus_data/brown_single/cf_cf18.txt
Processed: ../../../corpus_data/brown_single/ck_ck07.txt
Processed: ../../../corpus_data/brown_single/cf_cf30.txt
Processed: ../../../corpus_data/brown_single/cf_cf24.txt
Processed: ../../../corpus_data/brown_single/ck_ck13.txt
Processed: ../../../corpus_data/brown_single/cc_cc05.txt
Processed: ../../../corpus_data/brown_single/cn_cn26.txt
Processed: ../../../corpus_data/brown_single/cc_cc11.txt
Processed: ../../../corpus_data/brown_single/cf_cf43.txt
Processed: ../../../corpus_data/brown_single/cd_cd12.txt
Processed: ../../../corpus_data/brown_single/cd_cd06.txt
Processed: ../../../corpus_data/brown_single/ca_ca40.txt
Processed: ../../../corpus_data/brown_single/cr_cr03.txt
Processed: ../../../corpus_data/brown_single/cm_cm03.txt
Processed: ../../../corpus_data/brown_single/cj_cj28.txt
Processed: ../../../corpus_data/brown_single/cj_cj14.txt
Processed: ../../../corpus_data/brown_single/cg_cg23.txt
Processed: ../../../corpus_data/brown_single/cg_cg37.txt
Processed: ../../../corpus_data/brown_single/cg_cg36.txt
Processed: ../../../corpus_data/brown_single/cj_cj01.txt
Processed: ../../../corpus_data/brown_single/cj_cj15.txt
Processed: ../../../corpus_data/brown_single/cg_cg22.txt
Processed: ../../../corpus_data/brown_single/cj_cj29.txt
Processed: ../../../corpus_data/brown_single/cm_cm02.txt
Processed: ../../../corpus_data/brown_single/cr_cr02.txt
Processed: ../../../corpus_data/brown_single/ca_ca41.txt
Processed: ../../../corpus_data/brown_single/cd_cd07.txt
Processed: ../../../corpus_data/brown_single/cd_cd13.txt
Processed: ../../../corpus_data/brown_single/cf_cf42.txt
Processed: ../../../corpus_data/brown_single/cn_cn27.txt
Processed: ../../../corpus_data/brown_single/cc_cc10.txt
Processed: ../../../corpus_data/brown_single/cc_cc04.txt
Processed: ../../../corpus_data/brown_single/cn_cn19.txt
Processed: ../../../corpus_data/brown_single/cc_cc12.txt
Processed: ../../../corpus_data/brown_single/cn_cn25.txt
Processed: ../../../corpus_data/brown_single/cc_cc06.txt
Processed: ../../../corpus_data/brown_single/cf_cf40.txt
Processed: ../../../corpus_data/brown_single/cd_cd05.txt
Processed: ../../../corpus_data/brown_single/cd_cd11.txt
Processed: ../../../corpus_data/brown_single/ca_ca43.txt
Processed: ../../../corpus_data/brown_single/cg_cg08.txt
Processed: ../../../corpus_data/brown_single/cj_cj03.txt
Processed: ../../../corpus_data/brown_single/cg_cg34.txt
Processed: ../../../corpus_data/brown_single/cg_cg20.txt
Processed: ../../../corpus_data/brown_single/cj_cj17.txt
Processed: ../../../corpus_data/brown_single/cg_cg21.txt
Processed: ../../../corpus_data/brown_single/cj_cj16.txt
Processed: ../../../corpus_data/brown_single/cj_cj02.txt
Processed: ../../../corpus_data/brown_single/cg_cg35.txt
Processed: ../../../corpus_data/brown_single/cg_cg09.txt
Processed: ../../../corpus_data/brown_single/cm_cm01.txt
Processed: ../../../corpus_data/brown_single/cr_cr01.txt
Processed: ../../../corpus_data/brown_single/ca_ca42.txt
Processed: ../../../corpus_data/brown_single/cd_cd10.txt
Processed: ../../../corpus_data/brown_single/cd_cd04.txt
Processed: ../../../corpus_data/brown_single/cf_cf41.txt
Processed: ../../../corpus_data/brown_single/cc_cc07.txt
Processed: ../../../corpus_data/brown_single/cc_cc13.txt
Processed: ../../../corpus_data/brown_single/cn_cn24.txt
Processed: ../../../corpus_data/brown_single/cn_cn18.txt
Processed: ../../../corpus_data/brown_single/cn_cn20.txt
Processed: ../../../corpus_data/brown_single/cc_cc17.txt
Processed: ../../../corpus_data/brown_single/cc_cc03.txt
Processed: ../../../corpus_data/brown_single/cn_cn08.txt
Processed: ../../../corpus_data/brown_single/cf_cf45.txt
Processed: ../../../corpus_data/brown_single/cd_cd14.txt
Processed: ../../../corpus_data/brown_single/cr_cr05.txt
Processed: ../../../corpus_data/brown_single/cm_cm05.txt
Processed: ../../../corpus_data/brown_single/cg_cg31.txt
Processed: ../../../corpus_data/brown_single/cj_cj06.txt
Processed: ../../../corpus_data/brown_single/cj_cj12.txt
Processed: ../../../corpus_data/brown_single/cg_cg25.txt
Processed: ../../../corpus_data/brown_single/cg_cg19.txt
Processed: ../../../corpus_data/brown_single/cg_cg18.txt
Processed: ../../../corpus_data/brown_single/cj_cj13.txt
Processed: ../../../corpus_data/brown_single/cg_cg24.txt
Processed: ../../../corpus_data/brown_single/cg_cg30.txt
Processed: ../../../corpus_data/brown_single/cj_cj07.txt
Processed: ../../../corpus_data/brown_single/cm_cm04.txt
Processed: ../../../corpus_data/brown_single/cr_cr04.txt
Processed: ../../../corpus_data/brown_single/cd_cd15.txt
Processed: ../../../corpus_data/brown_single/cd_cd01.txt
Processed: ../../../corpus_data/brown_single/cf_cf44.txt
Processed: ../../../corpus_data/brown_single/cn_cn09.txt
Processed: ../../../corpus_data/brown_single/cc_cc02.txt
Processed: ../../../corpus_data/brown_single/cn_cn21.txt
Processed: ../../../corpus_data/brown_single/cc_cc16.txt
Processed: ../../../corpus_data/brown_single/cc_cc14.txt
Processed: ../../../corpus_data/brown_single/cn_cn23.txt
Processed: ../../../corpus_data/brown_single/cf_cf46.txt
Processed: ../../../corpus_data/brown_single/cd_cd17.txt
Processed: ../../../corpus_data/brown_single/cd_cd03.txt
Processed: ../../../corpus_data/brown_single/cr_cr06.txt
Processed: ../../../corpus_data/brown_single/cm_cm06.txt
Processed: ../../../corpus_data/brown_single/cg_cg26.txt
Processed: ../../../corpus_data/brown_single/cj_cj11.txt
Processed: ../../../corpus_data/brown_single/cj_cj05.txt
Processed: ../../../corpus_data/brown_single/cg_cg32.txt
Processed: ../../../corpus_data/brown_single/cj_cj39.txt
Processed: ../../../corpus_data/brown_single/cj_cj38.txt
Processed: ../../../corpus_data/brown_single/cj_cj04.txt
Processed: ../../../corpus_data/brown_single/cg_cg33.txt
Processed: ../../../corpus_data/brown_single/cg_cg27.txt
Processed: ../../../corpus_data/brown_single/cj_cj10.txt
Processed: ../../../corpus_data/brown_single/cr_cr07.txt
Processed: ../../../corpus_data/brown_single/ca_ca44.txt
Processed: ../../../corpus_data/brown_single/cd_cd02.txt
Processed: ../../../corpus_data/brown_single/cd_cd16.txt
Processed: ../../../corpus_data/brown_single/cf_cf47.txt
Processed: ../../../corpus_data/brown_single/cc_cc15.txt
Processed: ../../../corpus_data/brown_single/cn_cn22.txt
Processed: ../../../corpus_data/brown_single/cc_cc01.txt
Processed: ../../../corpus_data/brown_single/cn_cn13.txt
Processed: ../../../corpus_data/brown_single/cn_cn07.txt
Processed: ../../../corpus_data/brown_single/cj_cj09.txt
Processed: ../../../corpus_data/brown_single/cj_cj35.txt
Processed: ../../../corpus_data/brown_single/cg_cg02.txt
Processed: ../../../corpus_data/brown_single/cg_cg16.txt
Processed: ../../../corpus_data/brown_single/cj_cj21.txt
Processed: ../../../corpus_data/brown_single/cg_cg17.txt
Processed: ../../../corpus_data/brown_single/cj_cj20.txt
Processed: ../../../corpus_data/brown_single/cj_cj34.txt
Processed: ../../../corpus_data/brown_single/cg_cg03.txt
Processed: ../../../corpus_data/brown_single/cj_cj08.txt
Processed: ../../../corpus_data/brown_single/cn_cn06.txt
Processed: ../../../corpus_data/brown_single/cn_cn12.txt
Processed: ../../../corpus_data/brown_single/cn_cn04.txt
Processed: ../../../corpus_data/brown_single/cn_cn10.txt
Processed: ../../../corpus_data/brown_single/cr_cr09.txt
Processed: ../../../corpus_data/brown_single/cg_cg29.txt
Processed: ../../../corpus_data/brown_single/cj_cj22.txt
Processed: ../../../corpus_data/brown_single/cg_cg15.txt
Processed: ../../../corpus_data/brown_single/cg_cg01.txt
Processed: ../../../corpus_data/brown_single/cj_cj36.txt
Processed: ../../../corpus_data/brown_single/cj_cj37.txt
Processed: ../../../corpus_data/brown_single/cj_cj23.txt
Processed: ../../../corpus_data/brown_single/cg_cg14.txt
Processed: ../../../corpus_data/brown_single/cg_cg28.txt
Processed: ../../../corpus_data/brown_single/cr_cr08.txt
Processed: ../../../corpus_data/brown_single/cf_cf48.txt
Processed: ../../../corpus_data/brown_single/cn_cn11.txt
Processed: ../../../corpus_data/brown_single/cn_cn05.txt
Processed: ../../../corpus_data/brown_single/cn_cn01.txt
Processed: ../../../corpus_data/brown_single/cn_cn15.txt
Processed: ../../../corpus_data/brown_single/cn_cn29.txt
Processed: ../../../corpus_data/brown_single/cd_cd09.txt
Processed: ../../../corpus_data/brown_single/cg_cg10.txt
Processed: ../../../corpus_data/brown_single/cj_cj27.txt
Processed: ../../../corpus_data/brown_single/cj_cj33.txt
Processed: ../../../corpus_data/brown_single/cg_cg04.txt
Processed: ../../../corpus_data/brown_single/cg_cg38.txt
Processed: ../../../corpus_data/brown_single/cg_cg39.txt
Processed: ../../../corpus_data/brown_single/cj_cj32.txt
Processed: ../../../corpus_data/brown_single/cg_cg05.txt
Processed: ../../../corpus_data/brown_single/cg_cg11.txt
Processed: ../../../corpus_data/brown_single/cj_cj26.txt
Processed: ../../../corpus_data/brown_single/cd_cd08.txt
Processed: ../../../corpus_data/brown_single/cn_cn28.txt
Processed: ../../../corpus_data/brown_single/cn_cn14.txt
Processed: ../../../corpus_data/brown_single/cn_cn16.txt
Processed: ../../../corpus_data/brown_single/cn_cn02.txt
Processed: ../../../corpus_data/brown_single/cc_cc09.txt
Processed: ../../../corpus_data/brown_single/cg_cg07.txt
Processed: ../../../corpus_data/brown_single/cj_cj30.txt
Processed: ../../../corpus_data/brown_single/cj_cj24.txt
Processed: ../../../corpus_data/brown_single/cg_cg13.txt
Processed: ../../../corpus_data/brown_single/cj_cj18.txt
Processed: ../../../corpus_data/brown_single/cj_cj19.txt
Processed: ../../../corpus_data/brown_single/cj_cj25.txt
Processed: ../../../corpus_data/brown_single/cg_cg12.txt
Processed: ../../../corpus_data/brown_single/cg_cg06.txt
Processed: ../../../corpus_data/brown_single/cj_cj31.txt
Processed: ../../../corpus_data/brown_single/cc_cc08.txt
Processed: ../../../corpus_data/brown_single/cn_cn03.txt
Processed: ../../../corpus_data/brown_single/cn_cn17.txt
Processed: ../../../corpus_data/brown_single/cf_cf29.txt
Processed: ../../../corpus_data/brown_single/cf_cf01.txt
Processed: ../../../corpus_data/brown_single/cf_cf15.txt
Processed: ../../../corpus_data/brown_single/ck_ck22.txt
Processed: ../../../corpus_data/brown_single/ca_ca16.txt
Processed: ../../../corpus_data/brown_single/cl_cl21.txt
Processed: ../../../corpus_data/brown_single/ca_ca02.txt
Processed: ../../../corpus_data/brown_single/cl_cl09.txt
Processed: ../../../corpus_data/brown_single/ch_ch07.txt
Processed: ../../../corpus_data/brown_single/ce_ce30.txt
Processed: ../../../corpus_data/brown_single/ce_ce24.txt
Processed: ../../../corpus_data/brown_single/ch_ch13.txt
Processed: ../../../corpus_data/brown_single/ce_ce18.txt
Processed: ../../../corpus_data/brown_single/cb_cb27.txt
Processed: ../../../corpus_data/brown_single/cp_cp10.txt
Processed: ../../../corpus_data/brown_single/cp_cp04.txt
Processed: ../../../corpus_data/brown_single/cg_cg49.txt
Processed: ../../../corpus_data/brown_single/cg_cg61.txt
Processed: ../../../corpus_data/brown_single/cj_cj56.txt
Processed: ../../../corpus_data/brown_single/cj_cj42.txt
Processed: ../../../corpus_data/brown_single/cg_cg75.txt
Processed: ../../../corpus_data/brown_single/cj_cj43.txt
Processed: ../../../corpus_data/brown_single/cg_cg74.txt
Processed: ../../../corpus_data/brown_single/cg_cg60.txt
Processed: ../../../corpus_data/brown_single/cj_cj57.txt
Processed: ../../../corpus_data/brown_single/cp_cp05.txt
Processed: ../../../corpus_data/brown_single/cg_cg48.txt
Processed: ../../../corpus_data/brown_single/cp_cp11.txt
Processed: ../../../corpus_data/brown_single/cj_cj80.txt
Processed: ../../../corpus_data/brown_single/cb_cb26.txt
Processed: ../../../corpus_data/brown_single/ce_ce19.txt
Processed: ../../../corpus_data/brown_single/ce_ce25.txt
Processed: ../../../corpus_data/brown_single/ch_ch12.txt
Processed: ../../../corpus_data/brown_single/ch_ch06.txt
Processed: ../../../corpus_data/brown_single/ce_ce31.txt
Processed: ../../../corpus_data/brown_single/cl_cl08.txt
Processed: ../../../corpus_data/brown_single/ca_ca03.txt
Processed: ../../../corpus_data/brown_single/ca_ca17.txt
Processed: ../../../corpus_data/brown_single/cl_cl20.txt
Processed: ../../../corpus_data/brown_single/cf_cf14.txt
Processed: ../../../corpus_data/brown_single/ck_ck23.txt
Processed: ../../../corpus_data/brown_single/cf_cf28.txt
Processed: ../../../corpus_data/brown_single/ck_ck09.txt
Processed: ../../../corpus_data/brown_single/ck_ck21.txt
Processed: ../../../corpus_data/brown_single/cf_cf16.txt
Processed: ../../../corpus_data/brown_single/cf_cf02.txt
Processed: ../../../corpus_data/brown_single/ca_ca01.txt
Processed: ../../../corpus_data/brown_single/cl_cl22.txt
Processed: ../../../corpus_data/brown_single/ca_ca15.txt
Processed: ../../../corpus_data/brown_single/ca_ca29.txt
Processed: ../../../corpus_data/brown_single/ch_ch10.txt
Processed: ../../../corpus_data/brown_single/ce_ce27.txt
Processed: ../../../corpus_data/brown_single/ce_ce33.txt
Processed: ../../../corpus_data/brown_single/ch_ch04.txt
Processed: ../../../corpus_data/brown_single/cb_cb18.txt
Processed: ../../../corpus_data/brown_single/cb_cb24.txt
Processed: ../../../corpus_data/brown_single/cp_cp07.txt
Processed: ../../../corpus_data/brown_single/cp_cp13.txt
Processed: ../../../corpus_data/brown_single/cj_cj69.txt
Processed: ../../../corpus_data/brown_single/cj_cj41.txt
Processed: ../../../corpus_data/brown_single/cj_cj55.txt
Processed: ../../../corpus_data/brown_single/cg_cg62.txt
Processed: ../../../corpus_data/brown_single/cj_cj54.txt
Processed: ../../../corpus_data/brown_single/cg_cg63.txt
Processed: ../../../corpus_data/brown_single/cj_cj40.txt
Processed: ../../../corpus_data/brown_single/cj_cj68.txt
Processed: ../../../corpus_data/brown_single/cp_cp12.txt
Processed: ../../../corpus_data/brown_single/cp_cp06.txt
Processed: ../../../corpus_data/brown_single/cb_cb25.txt
Processed: ../../../corpus_data/brown_single/cb_cb19.txt
Processed: ../../../corpus_data/brown_single/ce_ce32.txt
Processed: ../../../corpus_data/brown_single/ch_ch05.txt
Processed: ../../../corpus_data/brown_single/ch_ch11.txt
Processed: ../../../corpus_data/brown_single/ce_ce26.txt
Processed: ../../../corpus_data/brown_single/ca_ca28.txt
Processed: ../../../corpus_data/brown_single/cl_cl23.txt
Processed: ../../../corpus_data/brown_single/ca_ca14.txt
Processed: ../../../corpus_data/brown_single/cf_cf03.txt
Processed: ../../../corpus_data/brown_single/ck_ck20.txt
Processed: ../../../corpus_data/brown_single/cf_cf17.txt
Processed: ../../../corpus_data/brown_single/ck_ck08.txt
Processed: ../../../corpus_data/brown_single/cf_cf13.txt
Processed: ../../../corpus_data/brown_single/ck_ck24.txt
Processed: ../../../corpus_data/brown_single/cf_cf07.txt
Processed: ../../../corpus_data/brown_single/ck_ck18.txt
Processed: ../../../corpus_data/brown_single/ca_ca38.txt
Processed: ../../../corpus_data/brown_single/ca_ca04.txt
Processed: ../../../corpus_data/brown_single/ca_ca10.txt
Processed: ../../../corpus_data/brown_single/ch_ch29.txt
Processed: ../../../corpus_data/brown_single/ce_ce22.txt
Processed: ../../../corpus_data/brown_single/ch_ch15.txt
Processed: ../../../corpus_data/brown_single/ch_ch01.txt
Processed: ../../../corpus_data/brown_single/ce_ce36.txt
Processed: ../../../corpus_data/brown_single/cb_cb21.txt
Processed: ../../../corpus_data/brown_single/cb_cb09.txt
Processed: ../../../corpus_data/brown_single/cj_cj44.txt
Processed: ../../../corpus_data/brown_single/cg_cg73.txt
Processed: ../../../corpus_data/brown_single/cg_cg67.txt
Processed: ../../../corpus_data/brown_single/cj_cj50.txt
Processed: ../../../corpus_data/brown_single/cj_cj78.txt
Processed: ../../../corpus_data/brown_single/cp_cp02.txt
Processed: ../../../corpus_data/brown_single/cp_cp16.txt
Processed: ../../../corpus_data/brown_single/cp_cp17.txt
Processed: ../../../corpus_data/brown_single/cp_cp03.txt
Processed: ../../../corpus_data/brown_single/cj_cj79.txt
Processed: ../../../corpus_data/brown_single/cg_cg66.txt
Processed: ../../../corpus_data/brown_single/cj_cj51.txt
Processed: ../../../corpus_data/brown_single/cj_cj45.txt
Processed: ../../../corpus_data/brown_single/cg_cg72.txt
Processed: ../../../corpus_data/brown_single/cb_cb08.txt
Processed: ../../../corpus_data/brown_single/cb_cb20.txt
Processed: ../../../corpus_data/brown_single/ce_ce23.txt
Processed: ../../../corpus_data/brown_single/ch_ch14.txt
Processed: ../../../corpus_data/brown_single/ch_ch28.txt
Processed: ../../../corpus_data/brown_single/ca_ca11.txt
Processed: ../../../corpus_data/brown_single/ca_ca05.txt
Processed: ../../../corpus_data/brown_single/ca_ca39.txt
Processed: ../../../corpus_data/brown_single/ck_ck19.txt
Processed: ../../../corpus_data/brown_single/cf_cf06.txt
Processed: ../../../corpus_data/brown_single/cf_cf12.txt
Processed: ../../../corpus_data/brown_single/ck_ck25.txt
Processed: ../../../corpus_data/brown_single/cf_cf04.txt
Processed: ../../../corpus_data/brown_single/ck_ck27.txt
Processed: ../../../corpus_data/brown_single/cf_cf10.txt
Processed: ../../../corpus_data/brown_single/cf_cf38.txt
Processed: ../../../corpus_data/brown_single/cl_cl18.txt
Processed: ../../../corpus_data/brown_single/cl_cl24.txt
Processed: ../../../corpus_data/brown_single/ca_ca13.txt
Processed: ../../../corpus_data/brown_single/ca_ca07.txt
Processed: ../../../corpus_data/brown_single/ce_ce09.txt
Processed: ../../../corpus_data/brown_single/ce_ce35.txt
Processed: ../../../corpus_data/brown_single/ch_ch02.txt
Processed: ../../../corpus_data/brown_single/ch_ch16.txt
Processed: ../../../corpus_data/brown_single/ce_ce21.txt
Processed: ../../../corpus_data/brown_single/cb_cb22.txt
Processed: ../../../corpus_data/brown_single/cj_cj53.txt
Processed: ../../../corpus_data/brown_single/cp_cp29.txt
Processed: ../../../corpus_data/brown_single/cg_cg64.txt
Processed: ../../../corpus_data/brown_single/cg_cg70.txt
Processed: ../../../corpus_data/brown_single/cj_cj47.txt
Processed: ../../../corpus_data/brown_single/cp_cp15.txt
Processed: ../../../corpus_data/brown_single/cg_cg58.txt
Processed: ../../../corpus_data/brown_single/cp_cp01.txt
Processed: ../../../corpus_data/brown_single/cp_cp14.txt
Processed: ../../../corpus_data/brown_single/cg_cg59.txt
Processed: ../../../corpus_data/brown_single/cg_cg71.txt
Processed: ../../../corpus_data/brown_single/cj_cj46.txt
Processed: ../../../corpus_data/brown_single/cp_cp28.txt
Processed: ../../../corpus_data/brown_single/cj_cj52.txt
Processed: ../../../corpus_data/brown_single/cg_cg65.txt
Processed: ../../../corpus_data/brown_single/cb_cb23.txt
Processed: ../../../corpus_data/brown_single/ch_ch17.txt
Processed: ../../../corpus_data/brown_single/ce_ce20.txt
Processed: ../../../corpus_data/brown_single/ce_ce34.txt
Processed: ../../../corpus_data/brown_single/ch_ch03.txt
Processed: ../../../corpus_data/brown_single/ce_ce08.txt
Processed: ../../../corpus_data/brown_single/ca_ca06.txt
Processed: ../../../corpus_data/brown_single/ca_ca12.txt
Processed: ../../../corpus_data/brown_single/cl_cl19.txt
Processed: ../../../corpus_data/brown_single/cf_cf39.txt
Processed: ../../../corpus_data/brown_single/ck_ck26.txt
Processed: ../../../corpus_data/brown_single/cf_cf11.txt
Processed: ../../../corpus_data/brown_single/cf_cf05.txt
Corpus size: 1242331 tokens
Unique words: 34947
Unique collocations: 81932
In the code above, the collocations have been extracted, but the resulting data structure is not easy for humans to read. In the following, I will convert the data into a tabular format.
def results_to_dataframe(results, min_freq=1):
    """
    Convert results dictionary to pandas DataFrame, one row per collocation

    Parameters:
    - results: dictionary with corpus_size, unigram, and bigram data
    - min_freq: minimum collocation frequency to include (default: 1)

    Returns:
    - DataFrame with the columns collocation, word1, word2, dep_relation,
      corpus_size, collocation_frequency, word1_freq and word2_freq, sorted
      by collocation_frequency in descending order with a fresh 0-based
      index. The columns are present even when no collocation passes the
      min_freq filter.
    """
    columns = [
        "collocation",
        "word1",
        "word2",
        "dep_relation",
        "corpus_size",
        "collocation_frequency",
        "word1_freq",
        "word2_freq",
    ]
    corpus_size = results["corpus_size"]
    unigram = results["unigram"]

    rows = []
    for bigram_info in results["bigram"].values():
        # Skip if below minimum frequency
        if bigram_info["freq"] < min_freq:
            continue
        word1 = bigram_info["word1"]
        word2 = bigram_info["word2"]
        rows.append({
            "collocation": f"{word1}_{word2}",
            "word1": word1,
            "word2": word2,
            "dep_relation": bigram_info["dep_rel"],
            "corpus_size": corpus_size,
            "collocation_frequency": bigram_info["freq"],
            # A word missing from the unigram table is reported with frequency 0
            "word1_freq": unigram.get(word1, 0),
            "word2_freq": unigram.get(word2, 0),
        })

    # Passing the columns explicitly keeps the schema stable when rows is empty
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # A stable sort keeps tied collocations in insertion order, so the
        # output is reproducible from run to run
        df = df.sort_values("collocation_frequency", ascending=False, kind="stable")
        df = df.reset_index(drop=True)
    return df
# Example with filtering
# Keep only collocations whose frequency is at least 5
df_collocations = results_to_dataframe(results, min_freq=5)
# Display first 20 rows
print("Top 20 collocations:")
print(df_collocations.head(20))
# Show DataFrame info
print(f"\nTotal collocations found: {len(df_collocations)}")
print(f"\nDataFrame columns: {list(df_collocations.columns)}")
Top 20 collocations:
collocation word1 ... word1_freq word2_freq
0 last_year last ... 705 1789
1 same_time same ... 686 1970
2 young_man young ... 442 2149
3 take_place take ... 1585 836
4 fiscal_year fiscal ... 120 1789
5 high_school high ... 801 711
6 old_man old ... 857 2149
7 first_time first ... 1388 1970
8 last_night last ... 705 468
9 other_hand other ... 2032 801
10 per_cent per ... 380 194
11 last_week last ... 705 450
12 middle_class middle ... 178 352
13 nineteenth_century nineteenth ... 58 307
14 have_time have ... 11908 1970
15 long_time long ... 1095 1970
16 few_year few ... 631 1789
17 index_word index ... 90 558
18 long_range long ... 1095 264
19 great_deal great ... 940 223
[20 rows x 8 columns]
Total collocations found: 1844
DataFrame columns: ['collocation', 'word1', 'word2', 'dep_relation', 'corpus_size', 'collocation_frequency', 'word1_freq', 'word2_freq']
Because the data set is still large, let’s filter it down to a manageable size.
First, we filter by frequency: let’s retain only the collocations that occur more than 10 times.
Second, let’s randomly sample 50 collocations from this pool.
| | collocation | word1 | word2 | dep_relation | corpus_size | collocation_frequency | word1_freq | word2_freq |
|---|---|---|---|---|---|---|---|---|
| 1 | last_year | last | year | amod | 1242331 | 123 | 705 | 1789 |
| 2 | same_time | same | time | amod | 1242331 | 94 | 686 | 1970 |
| 3 | young_man | young | man | amod | 1242331 | 89 | 442 | 2149 |
| 4 | take_place | take | place | dobj | 1242331 | 84 | 1585 | 836 |
| 5 | fiscal_year | fiscal | year | amod | 1242331 | 76 | 120 | 1789 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 375 | mystery_story | mystery | story | compound | 1242331 | 11 | 47 | 234 |
| 376 | get_job | get | job | dobj | 1242331 | 11 | 1460 | 308 |
| 377 | high_cost | high | cost | amod | 1242331 | 11 | 801 | 431 |
| 378 | onset_age | onset | age | compound | 1242331 | 11 | 43 | 284 |
| 379 | far_end | far | end | amod | 1242331 | 11 | 581 | 583 |
379 rows × 8 columns