Skip to main content

Collocation extraction from segmented texts

Project description

Collocation

Installation

pip install collocation

Usage

from collocation import Collocation

# Prepare corpus data
# https://yongfu.name/collocation/sampled_PTTposts.txt
# Build the corpus as a list of sentences, each sentence being a list of
# token strings. Every non-blank line of the file is one sentence; tokens
# within a line are delimited by the ideographic space (U+3000).
corpus = []
with open("sampled_PTTposts.txt", encoding="utf-8") as f:
    for line in f.read().split("\n"):
        # Lines consisting only of whitespace carry no sentence.
        if not line.strip():
            continue
        # Consecutive delimiters yield empty strings; drop them.
        corpus.append([token for token in line.split("\u3000") if token])

>>> corpus[:7]
[['物品', '名稱', ':', '學生證'],
 ['拾獲', '地點', ':', '大一女', '前'],
 ['拾獲', '時間', ':', '6', '/', '21'],
 ['18', ':', '20', '左右'],
 ['物品', '描述', ':', '就', '一', '張', '學生證'],
 ['聯絡', '方式', ':', '站', '內', '信'],
 ['其他', '說明', ':', '請', '失主', '或', '朋友', '速速', '聯絡', '喔']]


# Initialize
c = Collocation(corpus, left_window=3, right_window=3)
# Query
>>> c.get_topn_collocates("[臺台]灣", cutoff=3, n=3, by="MI", chinese_only=True)
[('臺灣', '國立',
  {'MI': 9.801006087045614,
   'Xsq': 3560.8618187084653,
   'Gsq': 46.973839463946646,
   'Dice': 0.04519774011299435,
   'DeltaP21': 0.0277504097342154,
   'DeltaP12': 0.12108001346126511,
   'RawCount': 4}),
 ('臺灣', '聯盟',
  {'MI': 9.064040492879409,
   'Xsq': 2133.3656286224623,
   'Gsq': 42.68555772916195,
   'Dice': 0.04020100502512563,
   'DeltaP21': 0.0277296477701336,
   'DeltaP12': 0.0725951622338306,
   'RawCount': 4}),
 ('臺灣', '大學',
  {'MI': 8.428314878476366,
   'Xsq': 3768.760643213294,
   'Gsq': 107.96714978953916,
   'Dice': 0.05804749340369393,
   'DeltaP21': 0.07617749434551055,
   'DeltaP12': 0.046682984348090525,
   'RawCount': 11})]

# Access documentation of parameters
help(c.get_topn_collocates)

Custom Association Measures

from math import log2
from collocation.association import FisherAttract, Dice


def logDice(O11, O12, O21, O22, E11, E12, E21, E22):
    """Return 14 plus the base-2 logarithm of the Dice score.

    All eight arguments are forwarded unchanged to ``Dice`` (imported from
    ``collocation.association``); this function does not inspect them itself.
    NOTE(review): the names suggest O* are observed and E* are expected
    contingency-table cell frequencies -- confirm against the package docs.

    ``math.log2`` raises ``ValueError`` when the Dice score is not positive.
    """
    dice_score = Dice(O11, O12, O21, O22, E11, E12, E21, E22)
    return log2(dice_score) + 14

c.association_measures = [FisherAttract, logDice]

>>> c.get_topn_collocates("[臺台]灣", cutoff=3, n=3, by="logDice", chinese_only=True)
[('臺灣', '大學',
  {'FisherAttract': 56.04305766162403,
   'logDice': 9.893377580466206,
   'RawCount': 11}),
 ('臺灣', '國立',
  {'FisherAttract': 25.040705008265565,
   'logDice': 9.532394449917003,
   'RawCount': 4}),
 ('臺灣', '聯盟',
  {'FisherAttract': 22.922605012581965,
   'logDice': 9.36337537945635,
   'RawCount': 4})]

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

collocation-0.0.7.tar.gz (6.3 kB view hashes)

Uploaded Source

Built Distribution

collocation-0.0.7-py3-none-any.whl (6.6 kB view hashes)

Uploaded Python 3

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page