Skip to main content

Async scraping library

Project description

A scraping library on top of aiohttp and parsechain. Note that this is alpha software.

Installation

pip install aioscrape

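Usage

Note: the example below uses START_URL, wait_all, write_to_csv and html_to_text, none of which are imported or defined in the snippet itself; they must be provided separately before the code will run.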

from aioscrape import run, fetch, settings
from aioscrape.middleware import last_fetch, make_filecache
from aioscrape.utils import SOME_HEADERS # To not look like a bot

from urllib.parse import urljoin
from parsechain import C
from funcy import lcat, lconcat


def main():
    """Run the whole scrape under one shared set of request settings.

    Builds the cache middleware, enters a ``settings`` scope with the
    headers and middleware, runs ``scrape_all()`` via ``run`` and prints
    whatever ``run`` returns.
    """
    # '.fcache' is the argument handed to make_filecache
    # (presumably the on-disk cache location -- confirm in aioscrape).
    file_cache = make_filecache('.fcache')

    # Settings are scoped to the "with" block and can be redefined later
    # with another "with".
    scoped_settings = settings(
        headers=SOME_HEADERS,
        middleware=[file_cache, last_fetch],
    )
    with scoped_settings:
        outcome = run(scrape_all())
        print(outcome)


async def scrape_all():
    """Fetch all listing pages, scrape their articles and export them.

    The coroutine has no return statement; the flattened article records
    are passed to ``write_to_csv('export.csv', ...)``.
    """
    # fetch() applies all the settings in scope, like headers and middleware
    first_page = await fetch(START_URL)

    # AioScrape integrates with parsechain to make extracting a breeze:
    # collect the pagination hrefs and resolve them against the page URL.
    paging_hrefs = first_page.css('.pagingLinks a').attrs('href')
    listing_urls = [urljoin(first_page.url, href) for href in paging_hrefs]

    # Using asyncio.wait() and friends to run the remaining requests in
    # parallel; the start page is already fetched, so it goes first.
    more_pages = await wait_all(map(fetch, listing_urls))
    listing_pages = [first_page] + more_pages

    # Scrape articles: one list of records per listing page, flattened by
    # lcat into a single list before export.
    per_page_articles = await wait_all(map(scrape_articles, listing_pages))
    articles = lcat(per_page_articles)
    write_to_csv('export.csv', articles)


async def scrape_articles(list_page):
    """Scrape every article linked from a single listing page.

    Reads the ``href`` of each ``#headlines .titleLink`` match on
    ``list_page``, resolves it against ``list_page.url`` and returns the
    awaited result of ``wait_all`` over ``scrape_article`` for those URLs.
    """
    title_hrefs = list_page.css('#headlines .titleLink').attrs('href')
    article_urls = [urljoin(list_page.url, href) for href in title_hrefs]

    pending_articles = map(scrape_article, article_urls)
    return await wait_all(pending_articles)


async def scrape_article(url):
    """Fetch one article page and extract its fields.

    Returns the result of ``multi`` on the response root, called with a
    mapping from field name ('url', 'title', 'date', 'text', 'contacts')
    to a parsechain ``C`` extractor.
    """
    page = await fetch(url)

    extractors = {
        # The URL of the response itself, not the requested one.
        'url': C.const(page.url),
        # First microdata value for each named property.
        'title': C.microdata('headline').first,
        'date': C.microdata('datePublished').first,
        'text': C.microdata('articleBody').first,
        # Each matched node: inner_html, then html_to_text; the mapped
        # results then go through lconcat and ''.join
        # (presumably "+" chains the steps left to right -- confirm in parsechain).
        'contacts': (
            C.css('.sidebars .contact p').map(C.inner_html + html_to_text)
            + lconcat
            + ''.join
        ),
    }
    return page.root.multi(extractors)


# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

TODO

  • Response.follow()
  • non-GET requests
  • work with forms

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Files for aioscrape, version 0.0.2
Filename, size File type Python version Upload date Hashes
Filename, size aioscrape-0.0.2-py2.py3-none-any.whl (9.0 kB) File type Wheel Python version 2.7 Upload date Hashes View hashes
Filename, size aioscrape-0.0.2.tar.gz (6.5 kB) File type Source Python version None Upload date Hashes View hashes

Supported by

Elastic Elastic Search Pingdom Pingdom Monitoring Google Google BigQuery Sentry Sentry Error logging AWS AWS Cloud computing DataDog DataDog Monitoring Fastly Fastly CDN DigiCert DigiCert EV certificate StatusPage StatusPage Status page