Async scraping library
An async scraping library built on top of aiohttp and parsechain. Note that this is alpha software.
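For orientation, here is roughly the smallest program the API suggests: a single `fetch()` driven by `run()`. This is a sketch based only on the calls shown in the usage example below; `EXAMPLE_URL` and `list_links` are placeholders, not part of the library.

```python
from aioscrape import run, fetch

EXAMPLE_URL = 'https://example.com'  # placeholder, not part of aioscrape

async def list_links(url):
    page = await fetch(url)             # async HTTP GET via aiohttp
    return page.css('a').attrs('href')  # parsechain-style extraction

print(run(list_links(EXAMPLE_URL)))     # run() drives the event loop and returns the result
```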
Installation
```
pip install aioscrape
```
Usage
```python
from aioscrape import run, fetch, settings
from aioscrape.middleware import last_fetch, make_filecache
from aioscrape.utils import SOME_HEADERS  # To not look like a bot

from urllib.parse import urljoin
from parsechain import C
from funcy import lcat, lconcat


def main():
    # Settings are scoped and can be redefined later with another "with"
    cache = make_filecache('.fcache')
    with settings(headers=SOME_HEADERS, middleware=[cache, last_fetch]):
        print(run(scrape_all()))


async def scrape_all():
    # All the settings in scope like headers and middleware are applied to fetch()
    start_page = await fetch(START_URL)

    # AioScrape integrates with parsechain to make extracting a breeze
    urls = start_page.css('.pagingLinks a').attrs('href')
    list_urls = [urljoin(start_page.url, page_url) for page_url in urls]

    # Using asyncio.wait() and friends to run requests in parallel
    list_pages = [start_page] + await wait_all(map(fetch, list_urls))

    # Scrape articles
    result = lcat(await wait_all(map(scrape_articles, list_pages)))
    write_to_csv('export.csv', result)


async def scrape_articles(list_page):
    urls = list_page.css('#headlines .titleLink').attrs('href')
    abs_urls = [urljoin(list_page.url, url) for url in urls]
    return await wait_all(map(scrape_article, abs_urls))


async def scrape_article(url):
    resp = await fetch(url)
    return resp.root.multi({
        'url': C.const(resp.url),
        'title': C.microdata('headline').first,
        'date': C.microdata('datePublished').first,
        'text': C.microdata('articleBody').first,
        'contacts': C.css('.sidebars .contact p')
                     .map(C.inner_html + html_to_text) + lconcat + ''.join,
    })


if __name__ == '__main__':
    main()
```
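The "Settings are scoped" comment above means `settings()` acts as a nested context: another `with settings(...)` can redefine keys for the fetches made under it. A minimal sketch, using only the `headers` and `middleware` keys shown above and assuming settings from an enclosing `with` stay in effect unless redefined; the URL and `get_page` helper are placeholders:

```python
from aioscrape import run, fetch, settings
from aioscrape.middleware import last_fetch, make_filecache
from aioscrape.utils import SOME_HEADERS

async def get_page(url):
    return await fetch(url)

# Outer scope: fetches get the browser-like headers.
with settings(headers=SOME_HEADERS):
    # Inner scope: redefine middleware so these fetches also go through the
    # file cache (whether this replaces or extends outer middleware is an
    # assumption to check against the aioscrape source).
    with settings(middleware=[make_filecache('.fcache'), last_fetch]):
        page = run(get_page('https://example.com'))
```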
TODO
- Response.follow()
- non-GET requests
- work with forms