lxml to pandas for fast web scraping
Project description
lxml to pandas for fast web scraping
Tested against Windows / Python 3.11 / Anaconda
pip install lxml2pandas
from lxml2pandas import subprocess_parsing
from PrettyColorPrinter import add_printer
# NOTE(review): presumably hooks PrettyColorPrinter into DataFrame printing;
# confirm what the argument 1 selects.
add_printer(1)

# (label, source) pairs handed to the parser. The label is the value the
# queries on the resulting frame compare against aa_doc_id (e.g. "betano").
# The paths are local .mhtml snapshots; adjust them to your own files.
htmldata = [
    ("bet365", r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online.mhtml"),
    (
        "betano",
        r"C:\Users\hansc\Downloads\Brasil Brasileirão - Série A Apostas - Futebol Odds _ Betano.mhtml",
    ),
    ("sportingbet", r"C:\Users\hansc\Downloads\Apostas Futebol _ Sportingbet.mhtml"),
]

# Parse every source into a single DataFrame (no pre-filtering).
df = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
)
# bet365: print team names and odds for each fixture container.
# The team-name / odds selection does not depend on the loop variable, so it
# is computed once instead of once per container.
# NOTE(review): unlike the betano query, there is no aa_doc_id filter here;
# confirm that aa_element_id values are unique across documents.
team_and_odds = df.loc[
    (df.aa_attr_values == "ovm-FixtureDetailsTwoWay_TeamName")
    | (df.aa_attr_values == "ovm-ParticipantOddsOnly_Odds")
]
for child in df.loc[df.aa_attr_values == "ovm-Fixture_Container"].aa_all_children:
    # child holds the element ids attached to this container row; keep only
    # the team-name / odds rows whose id is among them.
    dfr = team_and_odds.loc[team_and_odds.aa_element_id.isin(child)]
    # Only containers with exactly 5 matching rows are printed
    # (presumably 2 team names + 3 odds; confirm against the page layout).
    if len(dfr) == 5:
        print(dfr)
# betano: print odds and participant names for each event in the grid.
# Both original branches required a <span>, so that test is factored out, and
# the whole selection is built once because it is the same for every event.
is_odd = df.aa_attr_values == "selections__selection__odd"
is_participant = df.aa_attr_values.str.contains("participant-name", na=False)
betano_spans = df.loc[
    (df.aa_doc_id == "betano")
    & (df.aa_tag == "span")
    & (is_odd | is_participant)
]

# One entry per event row; each entry is the collection of element ids that
# the per-event lookup below matches against aa_element_id.
chi = df.loc[df.aa_attr_values == "events-list__grid__event"].aa_all_children
for c in chi:
    print(betano_spans.loc[betano_spans.aa_element_id.isin(c)])
# Baseline: the same sources parsed without any filter arguments.
df = subprocess_parsing(
    htmldata,
    chunks=1, processes=5, fake_header=True,
    print_stdout=True, print_stderr=True,
)

# pre-filter
# Restrict the parse by tag name via allowed_tags
# (presumably only the listed tags are kept; confirm against the library docs).
df0 = subprocess_parsing(
    htmldata,
    chunks=1, processes=5, fake_header=True,
    print_stdout=True, print_stderr=True,
    allowed_tags=("span", "div"),
)

# Same idea with a single tag; note the trailing comma that makes it a tuple.
df1 = subprocess_parsing(
    htmldata,
    chunks=1, processes=5, fake_header=True,
    print_stdout=True, print_stderr=True,
    allowed_tags=("span",),
)
# Combine a tag filter with an attribute-value filter (the three bet365
# class names used for fixtures, team names and odds).
df2 = subprocess_parsing(
    htmldata,
    chunks=1, processes=5, fake_header=True,
    print_stdout=True, print_stderr=True,
    allowed_tags=("span", "div"),
    allowed_attr=(
        "ovm-Fixture_Container",
        "ovm-FixtureDetailsTwoWay_TeamName",
        "ovm-ParticipantOddsOnly_Odds",
    ),
)

# Attribute filter plus an explicit tag exclusion.
# NOTE(review): the empty allowed_tags presumably means "no tag restriction";
# confirm against the library docs.
df3 = subprocess_parsing(
    htmldata,
    chunks=1, processes=5, fake_header=True,
    print_stdout=True, print_stderr=True,
    allowed_tags=(),
    allowed_attr=("ovm-ParticipantOddsOnly_Odds",),
    forbidden_tags=("p",),
)
# Filter on the class values used by both the bet365 and the betano queries,
# looking only at the "class" attribute key.
# The event class is spelled "events-list__grid__event", matching the value
# the betano query compares aa_attr_values against.
df4 = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    allowed_tags=(),
    allowed_attr=(
        "ovm-Fixture_Container",
        "ovm-FixtureDetailsTwoWay_TeamName",
        "ovm-ParticipantOddsOnly_Odds",
        "events-list__grid__event",
        "selections__selection__odd",
        "events-list__grid__info__main__participants__participant-name tw-truncate",
    ),
    allowed_attr_keys=("class",),
)
# Narrowest variant: a single class value, restricted to the "class" key.
df5 = subprocess_parsing(
    htmldata,
    chunks=1, processes=5, fake_header=True,
    print_stdout=True, print_stderr=True,
    allowed_tags=(),
    allowed_attr=("ovm-Fixture_Container",),
    allowed_attr_keys=("class",),
)
# parse a webpage:
# The source in the (label, source) pair is a URL here instead of a local
# file path; all three filter arguments are passed as empty tuples.
df = subprocess_parsing(
    [("python", "https://www.python.org/")],
    chunks=1, processes=5, fake_header=True,
    print_stdout=True, print_stderr=True,
    allowed_tags=(),
    allowed_attr=(),
    allowed_attr_keys=(),
)
Project details
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
lxml2pandas-0.13.tar.gz (40.1 kB, view hashes)
Built Distribution
lxml2pandas-0.13-py3-none-any.whl (39.9 kB, view hashes)
Close
Hashes for lxml2pandas-0.13-py3-none-any.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | 3bbcddcda41b279fa6c032f3f22e102f0cff59afb56762c0606f29d37fe1c6cb |
| MD5 | 2e9cc1fefdc7ee3e92b537778e36ff49 |
| BLAKE2b-256 | 2ee719de6224beedd1d4289bf7cf229cc946252e4c76d5915c9648eef67d1d2c |