Roboparse HTML
Project description
roboparse
A simple utility that helps you organize the code of your scraper.
Example
See the example directory.
Installation
- Via pip
pip install roboparse
- Via git
git clone https://github.com/Toffooo/roboparse.git
cd roboparse
pip install -e .
Usage
- Structure of project
If your scraper is small, you can create a single router for the whole web service.
Alternatively, you can split it into several small routers, one per feature.
routers.py
from roboparse import BaseRouter
class HabrRouterNews(BaseRouter):  # One small router per feature
    """Router that builds the router response for the Habr news list."""

    def __init__(self, *args, **kwargs) -> None:
        """Forward all positional and keyword arguments to ``BaseRouter``."""
        super().__init__(*args, **kwargs)

    def filter_data(self, data):
        """Filter/sort your data.

        This is a hook to be customised. By default it hands ``data`` back
        unchanged, so callers that use the return value receive the parsed
        data instead of ``None``.

        Args:
            data: The parsed data to post-process.

        Returns:
            The post-processed data (here: ``data`` itself, unmodified).
        """
        return data

    def get(self):
        """Create the router response for the news list and return it.

        The linter is a nested description: a ``LIST`` of ``li`` tags with
        class ``content-list__item``, each with a child ``ELEMENT`` ``h2``
        (class ``post__title``), which in turn has a child ``ELEMENT`` ``a``
        (class ``post__title_link``).
        """
        response = self.create_router_response(
            # The path is only metadata; it is not used for anything.
            path="https://habr.com/ru/",
            linter={
                "type": "LIST",
                "tag": "li",
                "attrs": {"class": "content-list__item"},
                "children": {
                    "type": "ELEMENT",
                    "tag": "h2",
                    "attrs": {"class": "post__title"},
                    "children": {
                        "type": "ELEMENT",
                        "tag": "a",
                        "attrs": {"class": "post__title_link"},
                    },
                },
            },
        )
        return response
class HabrRouter(BaseRouter):  # A single router for the whole web service
    """Router exposing one method per feature of the Habr web service."""

    def __init__(self, *args, **kwargs) -> None:
        """Pass every argument straight through to ``BaseRouter``."""
        super().__init__(*args, **kwargs)

    def get_news(self):
        """Build the router response for the news list and return it."""
        # The linter tree is assembled from the innermost node outwards.
        title_link = {
            "type": "ELEMENT",
            "tag": "a",
            "attrs": {"class": "post__title_link"},
        }
        title = {
            "type": "ELEMENT",
            "tag": "h2",
            "attrs": {"class": "post__title"},
            "children": title_link,
        }
        news_items = {
            "type": "LIST",
            "tag": "li",
            "attrs": {"class": "content-list__item"},
            "children": title,
        }
        return self.create_router_response(
            # The path is only metadata; it is not used for anything.
            path="https://habr.com/ru/",
            linter=news_items,
        )
main.py
import requests
from roboparse import Parser
from .routers import HabrRouter, HabrRouterNews
# Module-level Parser instance shared by the scrape functions below.
parser: Parser = Parser()
def scrape_news1():
    """Fetch a page, parse it via ``HabrRouterNews`` and print the filtered result."""
    news_router = HabrRouterNews(username="username", password="password")

    # The session is only needed for the request itself.
    with requests.Session() as http:
        page = http.get("url")

    # NOTE(review): the object returned by ``get`` is handed to ``parser.load``
    # as-is -- confirm whether ``load`` expects the response or its body text.
    parsed = parser.load(page, news_router.get())
    print(news_router.filter_data(parsed))
def scrape_news2():
    """Fetch a page, parse it via ``HabrRouter.get_news`` and print the result."""
    service_router = HabrRouter(username="username", password="password")

    # The session is only needed for the request itself.
    with requests.Session() as http:
        page = http.get("url")

    # NOTE(review): the object returned by ``get`` is handed to ``parser.load``
    # as-is -- confirm whether ``load`` expects the response or its body text.
    print(parser.load(page, service_router.get_news()))
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
roboparse-0.0.1.tar.gz
(5.0 kB
view details)
Built Distribution
File details
Details for the file roboparse-0.0.1.tar.gz.
File metadata
- Download URL: roboparse-0.0.1.tar.gz
- Upload date:
- Size: 5.0 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/3.4.1 importlib_metadata/4.0.1 pkginfo/1.7.0 requests/2.25.1 requests-toolbelt/0.9.1 tqdm/4.61.0 CPython/3.8.0
File hashes
Algorithm | Hash digest |
---|---|
SHA256 | e18721ad1278856333b3a7793755322f516f6ffd4f757a87e71e19612b3c02f3 |
MD5 | 21835649044d4783fe694a0ae486fc7b |
BLAKE2b-256 | f0b87b0acc50a8190b3e134df06ce2ef80117449775ac847eb7958fbacfc34db |
File details
Details for the file roboparse-0.0.1-py3-none-any.whl.
File metadata
- Download URL: roboparse-0.0.1-py3-none-any.whl
- Upload date:
- Size: 7.0 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/3.4.1 importlib_metadata/4.0.1 pkginfo/1.7.0 requests/2.25.1 requests-toolbelt/0.9.1 tqdm/4.61.0 CPython/3.8.0
File hashes
Algorithm | Hash digest |
---|---|
SHA256 | b6f57d4dd112bb8941310a30ee2be985aa27368ec2dc8441ae8223ea1d48f7bf |
MD5 | bdd7edc485d8d3e430843e9697f1afeb |
BLAKE2b-256 | d370ae136f7e2567b9158d912060f88ebbca5d9c0be3dec3fa3fb2c961a5b863 |