A simple helper for data parsing.
Project description
Parsing steps
A simple helper for data parsing.
SimpleParsingStep usage example
import time
from parsing_steps import SimpleParsingStep
class ParseMainPage(SimpleParsingStep):
    """
    First step of the example: parse the site's main page.

    input_data is main page url.
    input_data = "https://domain.com/"
    """

    def download(self, url):
        """Plug: pretend to fetch ``url`` and return canned main-page HTML."""
        time.sleep(0.1)  # stand-in for network latency
        return '<html>...<a href="https://domain.com/category/15/"></a>...</html>'

    def get_category_urls(self, html_data):
        """Plug: return the category urls; ``html_data`` is not inspected."""
        # Same host as the canned HTML returned by download() and as the url
        # format documented for ParseListStep input_data.
        return [f"https://domain.com/category/{num}/" for num in range(15, 16)]

    def parse(self, input_data):
        """Yield one ParseListStep for every category url of the main page."""
        html_data = self.download(input_data)
        for category_url in self.get_category_urls(html_data):
            yield ParseListStep({"url": category_url})
class ParseListStep(SimpleParsingStep):
    """
    Category listing step: yields one details step per item and follows
    the pagination link when the page has one.

    input_data from previous step.
    input_data = {
        "url": "https://domain.com/category/<id>/"
    }
    """

    def download(self, url):
        """Plug: return canned listing HTML; only the first page links to page 2."""
        time.sleep(0.1)
        requested_second_page = "?page=2" in url
        if not requested_second_page:
            return '<html>...<a href="https://domain.com/category/15/?page=2"></a>...</html>'
        return "<html>...</html>"

    def get_item_urls(self, html_data):
        """Plug: return ten fixed item urls; ``html_data`` is not inspected."""
        item_urls = []
        for item_no in range(10):
            item_urls.append(f"https://domain/category/15/item/{item_no}/")
        return item_urls

    def exists_next_page(self, html_data):
        """Tell whether the HTML contains a "?page=" link."""
        has_page_link = "?page=" in html_data
        return has_page_link

    def parse(self, input_data):
        """Yield a ParseDetailsStep per item, then a ParseListStep for the next page if any."""
        print(f"category_url: {input_data.get('url')}")
        # here you get data from html, through bs4 or something else.
        page_html = self.download(input_data.get("url"))
        for item_url in self.get_item_urls(page_html):
            # In this mock only the first page carries a "?page=" link, so
            # its presence is what tells page 1 from page 2.
            page_number = 1 if self.exists_next_page(page_html) else 2
            # optional part
            inherited = {"category_id": 15, "page": page_number}
            yield ParseDetailsStep(
                input_data={"url": item_url},
                inherited_data=inherited,
            )
        if not self.exists_next_page(page_html):
            return
        # get this data from html
        following_page_url = "https://domain.com/category/15/?page=2"
        yield ParseListStep(input_data={"url": following_page_url})
class ParseDetailsStep(SimpleParsingStep):
    """
    Item details step: builds a mock product record and passes it to SaveStep.

    input_data from previous step.
    input_data = {
        "url": "https://domain/category/15/item/{num}/"
    }
    """

    def download_and_format(self, url):
        """Plug: derive a fake product record from the item url.

        The item number is the last non-empty path segment of ``url``; the
        page number is read from the inherited data.

        Returns a dict with the keys "id", "name" and "price".

        Raises ValueError when the last path segment is not an integer and
        IndexError when ``url`` has no non-empty segment.
        """
        time.sleep(0.2)  # stand-in for network latency
        segments = [part for part in str(url).split("/") if part]
        num = int(segments[-1])
        # inherited_data is optional for a step, so do not crash when it is
        # missing or carries no "page"; treat that case as the first page.
        inherited = self.inherited_data or {}
        page = int(inherited.get("page", 1))
        product_no = page * 10 + num
        return {
            "id": product_no,
            "name": f"product_{product_no}",
            "price": num * 10,
        }

    def parse(self, input_data):
        """Yield a SaveStep carrying the record built for this item."""
        product_data = self.download_and_format(input_data.get("url"))
        yield SaveStep(product_data)
class SaveStep(SimpleParsingStep):
    """
    Final step: "stores" the product record (here it is only printed).

    input_data from previous step.
    input_data = {
        "id": 12345,
        "name": "product",
        "price": 16.50,
    }
    """

    def save_to_db(self, data_as_dict):
        """Plug: merge the inherited data into the record in place and print it."""
        if not self.inherited_data:
            # Normalise a missing/empty value so the merge below always works.
            self.inherited_data = dict()
        extra_fields = self.inherited_data
        data_as_dict.update(extra_fields)
        print(data_as_dict)

    def parse(self, input_data):
        """Save the incoming record; this step yields no further steps."""
        self.save_to_db(input_data)
if __name__ == "__main__":
    # Start scraping from the main page; every later step is yielded by the
    # parse() of the step before it.
    ParseMainPage(input_data="https://domain.com/").perform()
Project details
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
parsing_steps-0.0.5.tar.gz
(12.4 kB, view details)
Built Distribution
File details
Details for the file parsing_steps-0.0.5.tar.gz.
File metadata
- Download URL: parsing_steps-0.0.5.tar.gz
- Upload date:
- Size: 12.4 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.10.6
File hashes
Algorithm | Hash digest | |
---|---|---|
SHA256 | 4121fb4c7dd60ee6cfe98d4a5934f1c97197e5248ad54dacdc24fb6710f1d3f9 | |
MD5 | 674ecfc46337d4c4b4e16501c03468ef | |
BLAKE2b-256 | 54c98c3b825db649fafc3bc6ad57edfc80beab2f6b427f0ba3037c56b0582c3f | |
File details
Details for the file parsing_steps-0.0.5-py3-none-any.whl.
File metadata
- Download URL: parsing_steps-0.0.5-py3-none-any.whl
- Upload date:
- Size: 16.4 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.10.6
File hashes
Algorithm | Hash digest | |
---|---|---|
SHA256 | 8f502567f356f77f9de249af8b26d183128301b02b06c1a777c9c00c1eb5155d | |
MD5 | 997b1e0827df5e246916919c8c5883c0 | |
BLAKE2b-256 | e3b5b25ab76e85f8a9e143c4b00ac863ddb0a74258d6025aa749e593ece425d2 | |