Collocation extraction from segmented texts
Project description
Collocation
Installation
pip install collocation
Usage
from collocation import Collocation
# Prepare corpus data
# https://yongfu.name/collocation/sampled_PTTposts.txt
# Build the corpus as a list of sentences, each sentence a list of tokens.
# Each line of the file is treated as one sentence; tokens within a line
# are delimited by U+3000 (ideographic space).
corpus = []
with open("sampled_PTTposts.txt", encoding="utf-8") as f:
    for sent in f.read().split("\n"):
        # Skip blank / whitespace-only lines.
        if not sent.strip():
            continue
        # Drop empty tokens produced by consecutive or leading/trailing
        # delimiters.
        sentence = [tk for tk in sent.split("\u3000") if tk]
        corpus.append(sentence)
>>> corpus[:7]
[['物品', '名稱', ':', '學生證'],
['拾獲', '地點', ':', '大一女', '前'],
['拾獲', '時間', ':', '6', '/', '21'],
['18', ':', '20', '左右'],
['物品', '描述', ':', '就', '一', '張', '學生證'],
['聯絡', '方式', ':', '站', '內', '信'],
['其他', '說明', ':', '請', '失主', '或', '朋友', '速速', '聯絡', '喔']]
# Initialize
c = Collocation(corpus, left_window=3, right_window=3)
# Query
>>> c.get_topn_collocates("[臺台]灣", cutoff=3, n=3, by="MI", chinese_only=True)
[('臺灣', '國立',
{'MI': 9.801006087045614,
'Xsq': 3560.8618187084653,
'Gsq': 46.973839463946646,
'Dice': 0.04519774011299435,
'DeltaP21': 0.0277504097342154,
'DeltaP12': 0.12108001346126511,
'RawCount': 4}),
('臺灣', '聯盟',
{'MI': 9.064040492879409,
'Xsq': 2133.3656286224623,
'Gsq': 42.68555772916195,
'Dice': 0.04020100502512563,
'DeltaP21': 0.0277296477701336,
'DeltaP12': 0.0725951622338306,
'RawCount': 4}),
('臺灣', '大學',
{'MI': 8.428314878476366,
'Xsq': 3768.760643213294,
'Gsq': 107.96714978953916,
'Dice': 0.05804749340369393,
'DeltaP21': 0.07617749434551055,
'DeltaP12': 0.046682984348090525,
'RawCount': 11})]
# Access documentation of parameters
help(c.get_topn_collocates)
Custom Association Measures
from math import log2
from collocation.association import FisherAttract, Dice
def logDice(O11, O12, O21, O22, E11, E12, E21, E22):
    """Return ``14 + log2(Dice(...))`` for the given cell values.

    All eight arguments are forwarded unchanged to ``Dice``; presumably
    they are the observed (O) and expected (E) frequencies of a 2x2
    contingency table, as passed by ``Collocation`` to every association
    measure -- confirm against the package documentation.

    Note: ``log2`` raises ``ValueError`` if ``Dice`` returns a value <= 0.
    """
    D = Dice(O11, O12, O21, O22, E11, E12, E21, E22)
    return 14 + log2(D)
c.association_measures = [FisherAttract, logDice]
>>> c.get_topn_collocates("[臺台]灣", cutoff=3, n=3, by="logDice", chinese_only=True)
[('臺灣', '大學',
{'FisherAttract': 56.04305766162403,
'logDice': 9.893377580466206,
'RawCount': 11}),
('臺灣', '國立',
{'FisherAttract': 25.040705008265565,
'logDice': 9.532394449917003,
'RawCount': 4}),
('臺灣', '聯盟',
{'FisherAttract': 22.922605012581965,
'logDice': 9.36337537945635,
'RawCount': 4})]
Project details
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
collocation-0.0.7.tar.gz
(6.3 kB
view hashes)
Built Distribution
Close
Hashes for collocation-0.0.7-py3-none-any.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | e43152824c7c7e87d2794fa514e2069163237af9cd2a5a083abf1b62bf160471 |
| MD5 | ded13f6e7814a8b36360dd6cbe42a8d8 |
| BLAKE2b-256 | b707d86e79971dc2f43309840157f7b9d2e6381fe5dc977dca60e4f936f3770d |