分类扒数据的简易框架
Project description
## Example.py
# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq

from newspider.interfaces import *
from newspider.spider import Newspider
class DemoFetcher(IntFetcher):
    """Example fetcher that harvests post links from typechodev.com listings.

    Pagination links found while parsing a listing page are accumulated on
    the instance and handed back through ``next_pages()``.
    """

    def __init__(self):
        # Pagination hrefs collected by fetch_detail_urls(); never cleared or
        # de-duplicated here.
        self.next_page = []

    def fetch_detail_urls(self, html):
        """Extract detail-page URLs from one listing page.

        Args:
            html: Markup of a listing page, as accepted by ``pq()``.

        Returns:
            A list of ``(url, extras)`` tuples, one per ``.post-title a``
            element, where ``extras`` is a dict with a single ``"category"``
            entry derived from the URL.

        Side effects:
            Appends the ``href`` of every ``.page-navigator a`` element to
            ``self.next_page``.
        """
        d = pq(html)
        detail_urls = []
        for anchor in d('.post-title a'):
            url = d(anchor).attr('href')
            extras = {"category": "Test for %s" % url}
            detail_urls.append((url, extras))
        for link in d('.page-navigator a'):
            self.next_page.append(d(link).attr('href'))
        return detail_urls

    def start_page(self):
        """Return the fixed list of listing URLs this example starts from."""
        return [
            'http://www.typechodev.com/',
            'http://www.typechodev.com/index.php/category/questions/',
        ]

    def next_pages(self):
        """Return the pagination URLs collected so far (the live list, not a copy)."""
        return self.next_page
class DemoParser(IntParser):
    """Example parser that only reports each page it is handed."""

    def parse(self, tag, html, extras):
        """Print a one-line summary of the received page.

        Args:
            tag: Value echoed at the end of the message.
            html: Page content; unused by this example.
            extras: Dict read with ``.get()`` for the ``'_url'`` and
                ``'category'`` keys. ``'category'`` matches what
                ``DemoFetcher.fetch_detail_urls`` stores; ``'_url'`` is
                presumably injected by the framework -- TODO confirm.
        """
        # A single pre-formatted argument keeps the output identical under
        # both the Python 2 print statement and the Python 3 print function.
        print("Receive content from url %s for tag %s|%s"
              % (extras.get('_url'), extras.get('category'), tag))
if __name__ == '__main__':
    # Wire the demo fetcher and parser into a spider and start it.
    sp = Newspider()
    # NOTE(review): GUARD_INTERVAL is presumably the delay between requests;
    # confirm its meaning and units against the newspider documentation.
    sp.config('GUARD_INTERVAL', 0)
    sp.add_parser(DemoParser())
    sp.add_fetcher(DemoFetcher())
    sp.run()
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
File details
Details for the file newspider-0.9.9.tar.gz
.
File metadata
- Download URL: newspider-0.9.9.tar.gz
- Upload date:
- Size: 8.0 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
File hashes
Algorithm | Hash digest |
---|---|
SHA256 | ec8f628be4c38c9b4aa9e13a38d6ab1e462f9532695262c36b47ae3418a2d109 |
MD5 | e11f816e342dda8c2adf3c4df2f82bff |
BLAKE2b-256 | 5fb386af61ea05bccee02f2658de289ef087cbd71c26a0f7b35e31223414bd4c |