轻量级的爬虫框架,支持中间件、检验等功能
Project description
项目说明
- 轻量框架,支持中间件、检验等功能。用法与Scrapy、Feapder类似。
Python解释器
- python3
如何使用pader?
使用SlowSpider
- 单线程爬虫
from loguru import logger
import pader
class TestSpider(pader.SlowSpider):
    """Example single-threaded spider that walks the Baidu hot-search list.

    NOTE(review): the hook methods below (``when_spider_start``,
    ``when_spider_close``, ``middleware``, ``validate``) are presumably
    invoked by ``pader.SlowSpider``; confirm the exact call order against
    the pader documentation.
    """

    start_urls = ['https://www.baidu.com']

    def when_spider_start(self):
        """Print a start banner."""
        print('爬虫开始了...')

    def when_spider_close(self):
        """Print a finish banner."""
        print('...爬虫结束了')

    def parse(self, request, response):
        """Log every hot-search entry and schedule its link for ``parse_detail``."""
        entries = response.xpath('//ul[@id="hotsearch-content-wrapper"]/li')
        for entry in entries:
            url = entry.xpath('./a/@href').get()
            title = entry.xpath('./a/span[last()]/text()').get()
            logger.success(title)
            logger.success(url)
            logger.info('\r')
            # An entry without an href cannot be fetched, so do not build a
            # Request for it (presumably ``.get()`` yields None when nothing
            # matches -- confirm against the selector implementation).
            if url:
                yield pader.Request(url, callback=self.parse_detail)

    def parse_detail(self, request, response):
        """Log the title and link of every result heading on the page."""
        for heading in response.xpath('//div[@class="c-container"]//h3'):
            # The title may be split across several text nodes; join them.
            title = ''.join(heading.xpath('./a//text()').getall())
            url = heading.xpath('./a/@href').get()
            logger.success(title)
            logger.success(url)

    def middleware(self, request):
        """Tag the request with a mark derived from its callback's name."""
        if request.callback.__name__ == 'parse':
            request.mark = '百度首页'
        else:
            request.mark = '百度搜索页'
        logger.info('进入了中间件,已设置记号为{}'.format(request.mark))

    def validate(self, request, response):
        """Log the mark that ``middleware`` stored on the request."""
        logger.warning('进入了校验,记号={}'.format(request.mark))
# Run the example spider only when this file is executed as a script.
if __name__ == '__main__':
    TestSpider().crawl()
使用PaderSpider
- 多线程爬虫
import threading
import time
from loguru import logger
import pader
def t_name() -> str:
    """Return the name of the thread that is executing the caller."""
    return threading.current_thread().name
def show(request):
    """Log which callback the request targets and which thread is handling it."""
    callback_name = request.callback.__name__
    logger.success("回调: {} => 线程: {}".format(callback_name, t_name()))
# Baidu search URL (query "python3") used for every request in this example.
URL = "https://www.baidu.com/s?&wd=python3"
class TestSpider(pader.PaderSpider):
    """Example multi-threaded spider that fans out over three callback levels.

    Each ``parse`` call yields two ``parse_list`` requests, and each
    ``parse_list`` call yields three ``parse_detail`` requests, all for the
    same ``URL``.
    """

    def start_requests(self):
        """Yield five seed requests for ``URL``."""
        for _ in range(5):
            yield pader.Request(URL)

    def when_spider_start(self):
        """Log a start banner."""
        logger.info('爬虫开始了...')

    def when_spider_close(self):
        """Log a finish banner."""
        logger.info('...爬虫结束了')

    def parse(self, request, response):
        """Log the handling thread, then yield two ``parse_list`` requests."""
        show(request)
        for index in range(1, 3):
            mark = 'parse-{}'.format(index)
            yield pader.Request(URL, mark=mark, callback=self.parse_list)

    def parse_list(self, request, response):
        """Log the handling thread, then yield three ``parse_detail`` requests."""
        show(request)
        for index in range(1, 4):
            mark = 'parse_list-{}'.format(index)
            yield pader.Request(URL, mark=mark, callback=self.parse_detail)

    def parse_detail(self, request, response):
        """Log the handling thread; this level yields nothing further."""
        show(request)

    def middleware(self, request):
        """Delay each request so the concurrency is visible in the log."""
        time.sleep(1)  # Sleep 1s so the concurrent execution is easy to observe.
# Run the example spider only when this file is executed as a script.
# NOTE(review): ``speed`` presumably sets the number of worker threads and
# ``qsize`` the request-queue capacity -- confirm against the pader docs.
if __name__ == '__main__':
    TestSpider(speed=5, qsize=10).crawl()
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
pader-0.3.tar.gz
(6.4 kB, view details)
File details
Details for the file pader-0.3.tar.gz.
File metadata
- Download URL: pader-0.3.tar.gz
- Upload date:
- Size: 6.4 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/5.0.0 CPython/3.10.13
File hashes
Algorithm | Hash digest |
---|---|
SHA256 | 9b5b957867a3203f7f46e074190a4c00177acc16798bed8ca06dc612c46abc7c |
MD5 | 6f8badc00648765a932a1b7ad682f292 |
BLAKE2b-256 | 2508e143b1167a1dedd432cd19df25716fb9ab93082f529252878ed742affb7e |