This is not an awesome description
Project description
EasyCrawl
Code Example
from pprint import pprint
import requests
from parsel import Selector
from turbocrawler import Crawler, CrawlerRequest, CrawlerResponse, CrawlerRunner
class QuotesToScrapeCrawler(Crawler):
    """Example crawler that prints the quotes found on quotes.toscrape.com.

    Pages are downloaded through one shared ``requests`` session. Every
    ``div[class="quote"]`` element of a downloaded page is pretty-printed as
    a dict with the keys ``quote``, ``author`` and ``tags_list``.
    """

    crawler_name = "QuotesToScrape"
    # NOTE(review): how the framework interprets the three settings below is
    # not visible in this example -- confirm against the turbocrawler docs.
    allowed_domains = ['quotes.toscrape']
    regex_rules = [r'https://quotes.toscrape.com/page/[0-9]']
    time_between_requests = 1  # presumably seconds -- TODO confirm
    session: requests.Session

    def start_crawler(self) -> None:
        """Create the HTTP session reused by every request of this crawler."""
        self.session = requests.Session()

    def crawler_first_request(self) -> CrawlerResponse:
        """Download page 1 and return it wrapped in a ``CrawlerResponse``."""
        url = "https://quotes.toscrape.com/page/1/"
        response = self.session.get(url=url)
        return CrawlerResponse(site_url=response.url,
                               site_body=response.text,
                               status_code=response.status_code)

    def process_request(self, crawler_request: CrawlerRequest) -> CrawlerResponse:
        """Download ``crawler_request.site_url`` and wrap the HTTP response.

        The returned ``CrawlerResponse`` carries the final URL reported by
        ``requests``, the response text and the HTTP status code.
        """
        response = self.session.get(crawler_request.site_url)
        return CrawlerResponse(site_url=response.url,
                               site_body=response.text,
                               status_code=response.status_code)

    def parse_crawler_response(self, crawler_response: CrawlerResponse) -> None:
        """Extract every quote block from the page body and pretty-print it."""
        selector = Selector(crawler_response.site_body)
        for quote in selector.css('div[class="quote"]'):
            # ``.get()`` returns None when nothing matches; slicing None would
            # raise TypeError, so only slice when a text node was found.
            raw_text = quote.css('span:nth-child(1)::text').get()
            data = {
                # The slice drops the first and last character of the text
                # (presumably the surrounding quotation marks -- confirm).
                "quote": raw_text[1:-1] if raw_text is not None else None,
                "author": quote.css('span:nth-child(2)>small::text').get(),
                "tags_list": quote.css('div[class="tags"]>a::text').getall(),
            }
            pprint(data)

    def stop_crawler(self) -> None:
        """Close the HTTP session opened in ``start_crawler``."""
        self.session.close()
# Hand the crawler class itself (not an instance) to the runner, then run it.
runner = CrawlerRunner(crawler=QuotesToScrapeCrawler)
runner.run()
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
turbocrawler-0.0.1rc2.tar.gz
(19.1 kB
view hashes)
Built Distribution
Close
Hashes for turbocrawler-0.0.1rc2-py3-none-any.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | 735fea7507ecd6d8d21a73eed61ce2780246a2fabafbdb016c1a328192327494 |
| MD5 | b68400b37adfaa810d270bca98022784 |
| BLAKE2b-256 | 58dac52a785f8f32f33b6570d9b75b02e167398367d84197070b4c4a2ef69457 |