Regexps for social data
Project description
social-regexp
Installation
pip install -U social-regexp
Methods
>>> import social_regexp as sre
>>> text = "Hi, my Twitter is @tez_romach"
>>> sre.remove_mentions(text, sre.MENTION_TOKEN)
"Hi, my Twitter is <men>"
Full list of methods available here:
def not_contains_non_russian_cyrillic_letters(text: str) -> bool:
"""Checks that a text contains no non-Russian Cyrillic letters."""
def url() -> Pattern[str]:
"""Returns a pattern to match URLs."""
def spaces_before_punctuation() -> Pattern[str]:
"""Returns a pattern to match spaces before punctuation."""
def single_letter_words() -> Pattern[str]:
"""Returns a pattern to match single letter words."""
def blank_spaces() -> Pattern[str]:
"""Returns a pattern to match blank spaces."""
def mentions() -> Pattern[str]:
"""Returns a pattern to match mentions from Twitter or Instagram."""
def phones() -> Pattern[str]:
"""Returns a pattern to match phone numbers."""
def remove_urls(text: str, repl: str = "") -> str:
"""Return a new string with URLs replaced by `repl`."""
def remove_spaces_before_punctuation(text: str) -> str:
"""Return a new string without spaces before punctuation."""
def remove_punctuation(text: str) -> str:
"""Return a new string without punctuation."""
def remove_mentions(text: str, repl: str = "") -> str:
"""Return a new string with Twitter/Instagram mentions replaced by `repl`."""
def remove_single_letter_words(text: str) -> str:
"""Return new string without single-letter words."""
def remove_blank_spaces(text: str) -> str:
"""Return new string without blank spaces."""
def remove_phones(text: str, repl: str = "") -> str:
"""Return a new string with phone numbers replaced by `repl`."""
def preprocess_text(text: str) -> str:
"""Return new string with tokenized and processed text."""
result = remove_mentions(text, repl=MENTION_TOKEN)
result = remove_phones(result, repl=PHONE_TOKEN)
result = remove_urls(result, repl=URL_TOKEN)
result = remove_blank_spaces(result).strip()
result = remove_spaces_before_punctuation(result)
return result
Constants
MENTION_TOKEN = "<men>"
URL_TOKEN = "<url>"
PHONE_TOKEN = "<phn>"
HASH_TOKEN = "<hsh>"
ALL_TOKENS = [MENTION_TOKEN, URL_TOKEN, PHONE_TOKEN, HASH_TOKEN]
NON_RUSSIAN_CYRILLIC_LETTERS = {
"ә", "җ", "ң", "ө", "ү",
"қ", "ӯ", "ҳ", "ҷ", "ғ",
"ұ", "ә", "һ", "ґ", "є",
"ї", "ӑ", "ӗ", "ҫ", "ӳ",
"ҝ", "ғ", "ҹ",
}
🛡 License
This project is licensed under the terms of the MIT license. See LICENSE for more details.
📃 Citation
@misc{social-regexp,
author = {TezRomacH},
title = {Regexps for social data},
year = {2021},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/TezRomacH/social-regexp}}
}
Credits
This project was generated with python-package-template.
Project details
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
social-regexp-0.2.1.tar.gz (5.4 kB, view details)
File details
Details for the file social-regexp-0.2.1.tar.gz.
File metadata
File hashes
SHA256: 65dd2bb725c3b509d42bcaec08f6381703c98c92296da7520f113a86710acaa8
MD5: aeb4b7dfdc150c4297d99f70c12e9e23
BLAKE2b-256: 8d393b9eed1e505dd3a1289630b42e1c9f934e63b2c81002d8849c0c8e7d5ee7
See more details on using hashes here.