A simple web-crawling framework, based on aiohttp.
Project description
🔍 A powerful web-crawling framework, based on aiohttp.
Feature
Write your crawler in one Python script with asyncio
Schedule task with priority, fingerprint, exetime, recrawl…
Middleware: add handlers before or after tasks
Simple shortcuts to speed up scripting
Parse html conveniently with Parsel
Support JavaScript/browser-automation with pyppeteer
Stop and Resume: crawl periodically and persistently
Distributed work support with Redis
Installation
To install, simply use pipenv (or pip):
$ pipenv install acrawler
(Optional)
$ pipenv install uvloop #(only Linux/macOS, for faster asyncio event loop)
$ pipenv install aioredis #(if you need Redis support)
$ pipenv install motor #(if you need MongoDB support)
Documentation
Documentation and tutorial are available online at https://acrawler.readthedocs.io/ and in the docs directory.
Sample Code
Scrape quotes.toscrape.com
# Scrape quotes from http://quotes.toscrape.com/
# Fix: `get_logger` is called below but was missing from the import list,
# which made the sample fail with NameError on import.
from acrawler import Parser, Crawler, ParselItem, Request, get_logger

# Module-level logger for this example script.
logger = get_logger("quotes")
class QuoteItem(ParselItem):
    """One quote (text plus author) extracted from a `.quote` section."""

    # Log each built item.
    log = True

    # Constant field attached to every item.
    default_rules = {"type": "quote"}

    # First-match extraction rules: CSS for the author, XPath for the text.
    css_rules_first = {"author": "small.author::text"}
    xpath_rules_first = {"text": './/span[@class="text"]/text()'}

    # Strip the leading curly quote and keep only a short preview.
    field_processors = {"text": lambda text: text.strip("“")[:20]}
class AuthorItem(ParselItem):
    """An author's name and birth date extracted from an author page."""

    # Log each built item.
    log = True

    # Constant field attached to every item.
    default_rules = {"type": "author"}

    # First-match CSS rules for the two author fields.
    css_rules_first = {
        "name": "h3.author-title::text",
        "born": "span.author-born-date::text",
    }
class QuoteCrawler(Crawler):
    """Crawl quotes.toscrape.com: quote listing pages plus each author page."""

    # URL patterns used to classify responses and links.
    main_page = r"quotes.toscrape.com/page/\d+"
    author_page = r"quotes.toscrape.com/author/.*"

    # Each Parser handles responses whose URL matches `in_pattern`,
    # follows links matching `follow_patterns`, and builds `item_type` items.
    parsers = [
        Parser(
            in_pattern=main_page,
            follow_patterns=[main_page, author_page],
            item_type=QuoteItem,
            # One QuoteItem is built per `.quote` element on the page.
            css_divider=".quote",
        ),
        Parser(in_pattern=author_page, item_type=AuthorItem),
    ]

    async def start_requests(self):
        # Seed the crawl at page 1; the Parsers above follow the rest.
        yield Request(url="http://quotes.toscrape.com/page/1/")
if __name__ == "__main__":
    # Entry point: build the crawler and start its event loop.
    crawler = QuoteCrawler()
    crawler.run()
Scrape v2ex.com
from acrawler import Crawler, Request, register
class V2EXCrawler(Crawler):
    """Crawl the hot-topics tab of v2ex.com, re-fetching it periodically."""

    def start_requests(self):
        # `recrawl=5` re-schedules this request periodically;
        # `links_to_abs=True` turns relative links into absolute URLs.
        yield Request(
            url="https://www.v2ex.com/?tab=hot",
            callback=self.parse_hot,
            recrawl=5,
            links_to_abs=True,
        )

    def parse(self, response):
        # Default callback — automatically combined with every request
        # generated by start_requests().
        print(
            "This is default callback function! Auto-combined to any request generated by start_requests()."
        )

    def parse_hot(self, response):
        # One dict per topic link; drop the URL fragment so links are canonical.
        for anchor in response.sel.css(".item_title a"):
            yield {
                "url": response.urljoin(anchor).split("#")[0],
                "title": anchor.css("::text").get(),
            }
# Registered as a handler for every DefaultItem (plain dicts yielded by
# callbacks are wrapped as DefaultItem by the framework).
@register(family="DefaultItem")
def process_d(d):
    """Print the content of each DefaultItem produced by the crawler."""
    print(d.content)
if __name__ == "__main__":
    # Entry point: build the crawler and start its event loop.
    crawler = V2EXCrawler()
    crawler.run()
Scrape imdb.com
from acrawler import Crawler, Request, ParselItem, Handler, register, get_logger
def process_time(value):
    """Convert a duration string such as '3h 1min' to total minutes (181).

    Falsy input (None, empty string) is returned unchanged.
    """
    if not value:
        return value
    total = 0
    for part in value.split(" "):
        # "min" must be checked before "h" — "1min" also ends with "n",
        # and order mirrors the unit sizes in the input format.
        if part.endswith("min"):
            total += int(part.replace("min", ""))
        elif part.endswith("h"):
            total += 60 * int(part.replace("h", ""))
    return total
class MovieItem(ParselItem):
    """Item holding one movie's metadata scraped from its IMDB detail page."""

    # Single-value fields: the first CSS match wins.
    css_rules_first = {
        "title": "h1::text",
        "date": ".subtext a[href*=releaseinfo]::text",
        "time": ".subtext time::text",
        "rating": "span[itemprop=ratingValue]::text",
        "rating_count": "span[itemprop=ratingCount]::text",
        "metascore": ".metacriticScore span::text",
    }

    # Multi-value fields: every CSS match is collected into a list.
    css_rules = {
        "genres": ".subtext a[href*=genres]::text",
        "director": "h4:contains(Director) ~ a[href*=name]::text",
        "writers": "h4:contains(Writer) ~ a[href*=name]::text",
        "stars": "h4:contains(Star) ~ a[href*=name]::text",
    }

    # Post-process the raw "time" text (e.g. '3h 1min' -> 181) after extraction.
    field_processors = {"time": process_time}
class IMDBCrawler(Crawler):
    """Crawl IMDB's moviemeter chart and every listed movie's detail page."""

    # Throttling: at most 4 in-flight requests, 1 second between downloads.
    config = {"MAX_REQUESTS": 4, "DOWNLOAD_DELAY": 1}

    async def start_requests(self):
        yield Request("https://www.imdb.com/chart/moviemeter", links_to_abs=True)

    async def parse(self, response):
        # Follow each title link found in the chart table rows.
        for row in response.sel.css(".lister-list tr"):
            href = row.css(".titleColumn a::attr(href)").get()
            if href:
                yield Request(href, callback=self.parse_movie)

    async def parse_movie(self, response):
        # Strip the query string so every movie URL is stored in canonical form.
        yield MovieItem(response.sel, extra={"url": response.url_str.split("?")[0]})
@register()
class HorrorHandler(Handler):
    """After-handler that logs and tags every horror movie item."""

    # Runs for items of the MovieItem family.
    family = "MovieItem"
    logger = get_logger("horrorlog")

    async def handle_after(self, item):
        # `genres` is a list collected by MovieItem's css_rules.
        if item["genres"] and "Horror" in item["genres"]:
            self.logger.warning("({}) is a horror movie!!!!".format(item["title"]))
            # Fix: the yielded dict key was misspelled "singal" in the original.
            yield {"signal": "Leaving...", "title": item["title"]}
# Registered as a handler for every DefaultItem (plain dicts yielded by
# callbacks are wrapped as DefaultItem by the framework).
@register("DefaultItem")
def print_item(item):
    """Print the content of each DefaultItem produced by the crawler."""
    print(item.content)
if __name__ == "__main__":
    # Entry point: build the crawler and start its event loop.
    crawler = IMDBCrawler()
    crawler.run()
See examples.
Todo
Fix default fingerprint functions for request
Add delta_key support for request
Crawler’s name for distinguishing
Command Line config support
Prometheus monitor as command
Monitor all crawlers in web
Write detailed Documentation
Write testing code
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.