Skip to main content

A simple web-crawling framework, based on aiohttp.

Project description

PyPI Documentation Status

🔍 A powerful web-crawling framework, based on aiohttp.

Feature

  • Write your crawler in one Python script with asyncio

  • Schedule tasks with priority, fingerprint, exetime, recrawl…

  • Middleware: add handlers before or after tasks

  • Simple shortcuts to speed up scripting

  • Parse html conveniently with Parsel

  • Support JavaScript/browser-automation with pyppeteer

  • Stop and Resume: crawl periodically and persistently

  • Distributed work support with Redis

Installation

To install, simply use pipenv (or pip):

$ pipenv install acrawler

(Optional)
$ pipenv install uvloop      #(only Linux/macOS, for faster asyncio event loop)
$ pipenv install aioredis    #(if you need Redis support)
$ pipenv install motor       #(if you need MongoDB support)

Documentation

Documentation and tutorial are available online at https://acrawler.readthedocs.io/ and in the docs directory.

Sample Code

Scrape quotes.toscrape.com

# Scrape quotes from http://quotes.toscrape.com/
# Fix: get_logger is used below but was missing from the import list.
from acrawler import Parser, Crawler, ParselItem, Request, get_logger


# Module-level logger for this crawler script.
logger = get_logger("quotes")


class QuoteItem(ParselItem):
   """Item extracted from each quote block on a listing page.

   NOTE(review): rule semantics (default_rules / *_first rules / processors)
   follow acrawler's ParselItem conventions — confirm against acrawler docs.
   """

   log = True  # log each extracted item
   default_rules = {"type": "quote"}  # constant field added to every item
   css_rules_first = {"author": "small.author::text"}  # presumably first match only
   xpath_rules_first = {"text": './/span[@class="text"]/text()'}  # presumably first match only

   # Strip surrounding curly-quote characters from "text" and truncate to 20 chars.
   field_processors = {"text": lambda s: s.strip("“")[:20]}


class AuthorItem(ParselItem):
   """Item extracted from an author detail page (name and birth date)."""

   log = True  # log each extracted item
   default_rules = {"type": "author"}  # constant field added to every item
   css_rules_first = {
      "name": "h3.author-title::text",
      "born": "span.author-born-date::text",
   }


class QuoteCrawler(Crawler):
   """Crawl quotes.toscrape.com: QuoteItems from listing pages,
   AuthorItems from author pages.
   """

   # Regex URL patterns used to route responses to the parsers below.
   main_page = r"quotes.toscrape.com/page/\d+"
   author_page = r"quotes.toscrape.com/author/.*"
   # NOTE(review): css_divider presumably splits the page into per-quote
   # regions before item extraction — confirm in acrawler docs.
   parsers = [
      Parser(
            in_pattern=main_page,
            follow_patterns=[main_page, author_page],
            item_type=QuoteItem,
            css_divider=".quote",
      ),
      Parser(in_pattern=author_page, item_type=AuthorItem),
   ]

   async def start_requests(self):
      # Seed the crawl at the first listing page.
      yield Request(url="http://quotes.toscrape.com/page/1/")


if __name__ == "__main__":
   # Entry point: build the crawler and start it.
   crawler = QuoteCrawler()
   crawler.run()

Scrape v2ex.com

from acrawler import Crawler, Request, register


class V2EXCrawler(Crawler):
   """Crawl the v2ex.com hot tab and yield one dict per topic link."""

   def start_requests(self):
      # Seed request; recrawl=5 re-schedules it periodically
      # (exact units per acrawler's Request docs).
      seed = Request(
         url="https://www.v2ex.com/?tab=hot",
         callback=self.parse_hot,
         recrawl=5,
         links_to_abs=True,
      )
      yield seed

   def parse(self, response):
      # Default callback, attached automatically to requests from start_requests().
      message = "This is default callback function! Auto-combined to any request generated by start_requests()."
      print(message)

   def parse_hot(self, response):
      # Emit one {url, title} dict per anchor in the topic-title cells;
      # the fragment part of each href is dropped.
      for anchor in response.sel.css(".item_title a"):
         yield {
            "url": response.urljoin(anchor).split("#")[0],
            "title": anchor.css("::text").get(),
         }


@register(family="DefaultItem")
def process_d(d):
   """Print the content of every DefaultItem produced by the crawler."""
   payload = d.content
   print(payload)


if __name__ == "__main__":
   # Entry point: build the crawler and start it.
   crawler = V2EXCrawler()
   crawler.run()

Scrape imdb.com

from acrawler import Crawler, Request, ParselItem, Handler, register, get_logger


def process_time(value):
   """Convert a runtime string such as '3h 1min' to total minutes (181).

   Falsy input (None, '') is returned unchanged.  Tokens ending in
   neither 'h' nor 'min' are silently ignored.
   """
   if not value:
      # Preserve None / empty string as-is.
      return value
   minutes = 0
   for token in value.split(" "):
      if token.endswith("min"):
         minutes += int(token.replace("min", ""))
      elif token.endswith("h"):
         minutes += 60 * int(token.replace("h", ""))
   return minutes


class MovieItem(ParselItem):
   """Item for an IMDB movie page: headline facts plus credit lists.

   NOTE(review): *_first rules presumably keep only the first match while
   plain css_rules collect all matches — confirm against acrawler docs.
   """

   # Single-valued fields.
   css_rules_first = {
      "title": "h1::text",
      "date": ".subtext a[href*=releaseinfo]::text",
      "time": ".subtext time::text",
      "rating": "span[itemprop=ratingValue]::text",
      "rating_count": "span[itemprop=ratingCount]::text",
      "metascore": ".metacriticScore span::text",
   }

   # Multi-valued fields (credit lists).
   css_rules = {
      "genres": ".subtext a[href*=genres]::text",
      "director": "h4:contains(Director) ~ a[href*=name]::text",
      "writers": "h4:contains(Writer) ~ a[href*=name]::text",
      "stars": "h4:contains(Star) ~ a[href*=name]::text",
   }

   # Convert the raw duration string (e.g. '3h 1min') to minutes.
   field_processors = {"time": process_time}


class IMDBCrawler(Crawler):
   """Crawl IMDB's most-popular chart and yield a MovieItem per film."""

   # Throttling knobs; exact semantics per acrawler's config docs.
   config = {"MAX_REQUESTS": 4, "DOWNLOAD_DELAY": 1}

   async def start_requests(self):
      # links_to_abs turns relative hrefs into absolute URLs.
      yield Request("https://www.imdb.com/chart/moviemeter", links_to_abs=True)

   async def parse(self, response):
      # Follow every title link found in the chart table rows.
      for row in response.sel.css(".lister-list tr"):
         href = row.css(".titleColumn a::attr(href)").get()
         if not href:
            continue
         yield Request(href, callback=self.parse_movie)

   async def parse_movie(self, response):
      # Drop the query string so each movie keeps a canonical URL.
      clean_url = response.url_str.split("?")[0]
      yield MovieItem(response.sel, extra={"url": clean_url})


@register()
class HorrorHandler(Handler):
   """Warn whenever a scraped MovieItem turns out to be a horror movie.

   The family tag restricts dispatch to MovieItem items (per acrawler's
   handler conventions — confirm in docs).
   """

   family = "MovieItem"
   logger = get_logger("horrorlog")

   async def handle_after(self, item):
      # genres may be missing/empty for some pages; guard before membership test.
      if item["genres"] and "Horror" in item["genres"]:
         self.logger.warning("({}) is a horror movie!!!!".format(item["title"]))
         # Fix: dict key was misspelled "singal" in the original.
         yield {"signal": "Leaving...", "title": item["title"]}


@register("DefaultItem")
def print_item(item):
   """Print the content of every DefaultItem produced by the crawler."""
   payload = item.content
   print(payload)


if __name__ == "__main__":
   # Entry point: build the crawler and start it.
   crawler = IMDBCrawler()
   crawler.run()

See examples.

Todo

  • Fix default fingerprint functions for request

  • Add delta_key support for request

  • Crawler’s name for distinguishing

  • Command Line config support

  • Prometheus monitor as command

  • Monitor all crawlers in web

  • Write detailed Documentation

  • Write testing code

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

acrawler-0.0.9.tar.gz (31.1 kB view hashes)

Uploaded source

Supported by

AWS AWS Cloud computing Datadog Datadog Monitoring Facebook / Instagram Facebook / Instagram PSF Sponsor Fastly Fastly CDN Google Google Object Storage and Download Analytics Huawei Huawei PSF Sponsor Microsoft Microsoft PSF Sponsor NVIDIA NVIDIA PSF Sponsor Pingdom Pingdom Monitoring Salesforce Salesforce PSF Sponsor Sentry Sentry Error logging StatusPage StatusPage Status page