Modern, user-friendly web automation and scraping library.
Project description
A programmatic web browser/crawler in Python, and an alternative to Mechanize, RoboBrowser, MechanicalSoup and others. It combines the strengths of Requests and lxml, and ships with features and methods useful for scraping out of the box.
Install
$ pip install delver
Documentation
Quick start - usage examples
Basic examples
Form submit
>>> from delver import Crawler
>>> c = Crawler()
>>> response = c.open('https://httpbin.org/forms/post')
>>> forms = c.forms()
# Filling in field values:
>>> form = forms[0]
>>> form.fields = {
... 'custname': 'Ruben Rybnik',
... 'custemail': 'ruben.rybnik@fakemail.com',
... 'size': 'medium',
... 'topping': ['bacon', 'cheese'],
... 'custtel': '+48606505888'
... }
>>> submit_result = c.submit(form)
>>> submit_result.status_code
200
# Checking whether the form post succeeded:
>>> c.submit_check(
... form,
... phrase="Ruben Rybnik",
... url='https://httpbin.org/forms/post',
... status_codes=[200]
... )
True
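Since submit() returns a regular response object, the same check can also be done by hand; a small sketch, assuming the response exposes the usual Requests .text attribute:

>>> 'Ruben Rybnik' in submit_result.text  # assumes Requests-style .text
True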
Find links narrowed by filters
>>> c = Crawler()
>>> c.open('https://httpbin.org/links/10/0')
<Response [200]>
# Links can be narrowed down by HTML tags and by attribute filters
# such as id, text, title and class:
>>> links = c.links(
... tags = ('style', 'link', 'script', 'a'),
... filters = {
... 'text': '7'
... },
... match='NOT_EQUAL'
... )
>>> len(links)
8
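For contrast, a sketch of the inverse query; the 'EQUAL' match mode is assumed here as the counterpart of the 'NOT_EQUAL' mode shown above:

>>> exact_links = c.links(
...     tags=('a',),
...     filters={'text': '7'},
...     match='EQUAL'  # assumed counterpart of 'NOT_EQUAL'
... )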
Download file
>>> import os
>>> c = Crawler()
>>> local_file_path = c.download(
... local_path='test',
... url='https://httpbin.org/image/png',
... name='test.png'
... )
>>> os.path.isfile(local_file_path)
True
Download a list of files in parallel
>>> c = Crawler()
>>> c.open('https://xkcd.com/')
<Response [200]>
>>> full_images_urls = [c.join_url(src) for src in c.images()]
>>> downloaded_files = c.download_files('test', files=full_images_urls)
>>> len(full_images_urls) == len(downloaded_files)
True
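download_files spreads the downloads over multiple threads; the pool size can be tuned with the workers argument, as used in the One Punch Man example further below:

>>> downloaded_files = c.download_files('test', files=full_images_urls, workers=10)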
XPath selectors
c = Crawler()
c.open('https://httpbin.org/html')
p_text = c.xpath('//p/text()')
CSS selectors
c = Crawler()
c.open('https://httpbin.org/html')
div_elements = c.css('div')
XPath results with filters
c = Crawler()
c.open('https://www.w3schools.com/')
filtered_results = c.xpath('//p').filter(filters={'class': 'w3-xlarge'})
Using retries
c = Crawler()
# Setting max_retries to 2 means there will be at most two retry
# attempts to open the URL: if the first attempt fails, wait 1 second
# and try again; if that also fails, wait 2 seconds before the final try.
c.max_retries = 2
c.open('http://www.delver.cg/404')
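If every attempt fails, it is safest to inspect the result; a minimal sketch, assuming open() still returns the last response object once the retries are exhausted:

response = c.open('http://www.delver.cg/404')
if response.status_code != 200:
    # Assumption: open() returns the final failed response rather than raising.
    print('All retry attempts failed:', response.status_code)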
Usage examples
Scraping Steam Specials using XPath
from pprint import pprint
from delver import Crawler
c = Crawler(absolute_links=True)
c.logging = True
c.useragent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
c.random_timeout = (0, 5)
c.open('http://store.steampowered.com/search/?specials=1')
titles, discounts, final_prices = [], [], []
# Scrape each page, then follow the '>' pagination link while one exists
# (scraping before paginating ensures the first results page is not skipped):
while True:
    titles.extend(
        c.xpath("//div/span[@class='title']/text()")
    )
    discounts.extend(
        c.xpath("//div[contains(@class, 'search_discount')]/span/text()")
    )
    final_prices.extend(
        c.xpath("//div[contains(@class, 'discounted')]//text()[2]").strip()
    )
    next_page = c.links(filters={
        'class': 'pagebtn',
        'text': '>'
    })
    if not next_page:
        break
    c.open(next_page[0])

all_results = {
    row[0]: {
        'discount': row[1],
        'final_price': row[2]
    } for row in zip(titles, discounts, final_prices)}
pprint(all_results)
Simple table scraping out of the box
from pprint import pprint
from delver import Crawler
c = Crawler(absolute_links=True)
c.logging = True
c.useragent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
c.open("http://www.boxofficemojo.com/daily/")
pprint(c.tables())
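As a follow-up sketch, assuming tables() returns a list of tables where each table is a list of rows, the first table could be dumped to CSV with the standard library:

import csv

# Assumption: c.tables() yields a list of tables, each a list of row lists.
tables = c.tables()
if tables:
    with open('daily_box_office.csv', 'w', newline='') as handle:
        csv.writer(handle).writerows(tables[0])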
User login
from delver import Crawler
c = Crawler()
c.useragent = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/60.0.3112.90 Safari/537.36"
)
c.random_timeout = (0, 5)
c.open('http://testing-ground.scraping.pro/login')
forms = c.forms()
if forms:
    login_form = forms[0]
    login_form.fields = {
        'usr': 'admin',
        'pwd': '12345'
    }
    c.submit(login_form)
    success_check = c.submit_check(
        login_form,
        phrase='WELCOME :)',
        status_codes=[200]
    )
    print(success_check)
One Punch Man Downloader
import os
from delver import Crawler
class OnePunchManDownloader:
    """Downloads One Punch Man free manga chapters to local directories.

    Uses one main thread for the scraper with a random timeout.
    Uses 20 threads just for image downloads.
    """
    def __init__(self):
        self._target_directory = 'one_punch_man'
        self._start_url = "http://m.mangafox.me/manga/onepunch_man_one/"
        self.crawler = Crawler()
        self.crawler.random_timeout = (0, 5)
        self.crawler.useragent = "Googlebot-Image/1.0"

    def run(self):
        self.crawler.open(self._start_url)
        for link in self.crawler.links(filters={'text': 'Ch '}, match='IN'):
            self.download_images(link)

    def download_images(self, link):
        target_path = '{}/{}'.format(self._target_directory, link.split('/')[-2])
        full_chapter_url = link.replace('/manga/', '/roll_manga/')
        self.crawler.open(full_chapter_url)
        images = self.crawler.xpath("//img[@class='reader-page']/@data-original")
        os.makedirs(target_path, exist_ok=True)
        self.crawler.download_files(target_path, files=images, workers=20)


downloader = OnePunchManDownloader()
downloader.run()
History
0.1.3 (2017-10-03)
First release on PyPI.