Regexps for social data
Project description
social-regexp
Installation
pip install -U social-regexp
Methods
>>> import social_regexp as sre
>>> text = "Hi, my Twitter is @tez_romach"
>>> sre.remove_mentions(text, sre.MENTION_TOKEN)
"Hi, my Twitter is <men>"
The full list of available methods:
def not_contains_non_russian_cyrillic_letters(text: str) -> bool:
    """Check `text` for Cyrillic letters that are not part of the Russian alphabet.

    NOTE(review): the function name suggests the result is True when no such
    letter is present, whereas the original wording ("checks if a text
    contains any non-russian but cyrillic letter") reads the other way.
    The implementation is not shown in this listing -- confirm the polarity.
    """
def url() -> Pattern[str]:
    """Return a pattern that matches URLs."""
def spaces_before_punctuation() -> Pattern[str]:
    """Return a pattern that matches spaces placed before punctuation."""
def single_letter_words() -> Pattern[str]:
    """Return a pattern that matches single-letter words."""
def blank_spaces() -> Pattern[str]:
    """Return a pattern that matches blank spaces."""
def mentions() -> Pattern[str]:
    """Return a pattern that matches Twitter or Instagram mentions."""
def phones() -> Pattern[str]:
    """Return a pattern that matches phone numbers."""
def remove_urls(text: str, repl: str = "") -> str:
    """Return a new string in which every URL is replaced with `repl`.

    With the default `repl=""` the URLs are removed outright.
    """
def remove_spaces_before_punctuation(text: str) -> str:
    """Return a new string with the spaces before punctuation removed."""
def remove_punctuation(text: str) -> str:
    """Return a new string with punctuation removed."""
def remove_mentions(text: str, repl: str = "") -> str:
    """Return a new string in which every Twitter/Instagram mention is replaced with `repl`.

    With the default `repl=""` the mentions are removed outright.
    """
def remove_single_letter_words(text: str) -> str:
    """Return a new string with single-letter words removed."""
def remove_blank_spaces(text: str) -> str:
    """Return a new string with blank spaces removed."""
def remove_phones(text: str, repl: str = "") -> str:
    """Return a new string in which every phone number is replaced with `repl`.

    With the default `repl=""` the phone numbers are removed outright.
    """
def preprocess_text(text: str) -> str:
    """Return a tokenized, cleaned-up copy of `text`.

    Mentions, phone numbers, and URLs are replaced -- in that order -- with
    MENTION_TOKEN, PHONE_TOKEN, and URL_TOKEN. The result is then passed
    through remove_blank_spaces, stripped of leading/trailing whitespace,
    and finally passed through remove_spaces_before_punctuation.
    """
    # The order matters: each replacement runs on the output of the previous one.
    substitutions = (
        (remove_mentions, MENTION_TOKEN),
        (remove_phones, PHONE_TOKEN),
        (remove_urls, URL_TOKEN),
    )
    masked = text
    for substitute, token in substitutions:
        masked = substitute(masked, repl=token)

    squeezed = remove_blank_spaces(masked).strip()
    return remove_spaces_before_punctuation(squeezed)
Constants
# Placeholder tokens. preprocess_text substitutes MENTION_TOKEN, PHONE_TOKEN,
# and URL_TOKEN for mentions, phone numbers, and URLs respectively.
# NOTE(review): HASH_TOKEN is presumably meant for hashtags, but nothing in
# this listing uses it -- confirm.
MENTION_TOKEN = "<men>"
URL_TOKEN = "<url>"
PHONE_TOKEN = "<phn>"
HASH_TOKEN = "<hsh>"
ALL_TOKENS = [MENTION_TOKEN, URL_TOKEN, PHONE_TOKEN, HASH_TOKEN]

# Lowercase Cyrillic letters that are not part of the Russian alphabet.
# Each letter is listed exactly once: the earlier literal repeated "ә" and
# "ғ", which is harmless in a set but hides the real size (21 letters).
NON_RUSSIAN_CYRILLIC_LETTERS = {
    "ә", "җ", "ң", "ө", "ү",
    "қ", "ӯ", "ҳ", "ҷ", "ғ",
    "ұ", "һ", "ґ", "є", "ї",
    "ӑ", "ӗ", "ҫ", "ӳ", "ҝ",
    "ҹ",
}
🛡 License
This project is licensed under the terms of the MIT license. See LICENSE for more details.
📃 Citation
@misc{social-regexp,
author = {TezRomacH},
title = {Regexps for social data},
year = {2021},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/TezRomacH/social-regexp}}
}
Credits
This project was generated with python-package-template.
Project details
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
social-regexp-0.2.1.tar.gz (5.4 kB, view hashes)
Built Distribution
Close
Hashes for social_regexp-0.2.1-py3-none-any.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | 9a7d5da2052730d94b0a1b780f417431214107bfee90ce38558cf2207528408f |
| MD5 | 54cbae4adf9dfb681d8d2f539537c04f |
| BLAKE2b-256 | ce4941565852b26bc91496e178b54a6d0f03e0f196e4e4090a739c6cc4fcac84 |