Skip to main content

分类扒数据的简易框架

Project description

## Example.py

# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq

from newspider.interfaces import *
from newspider.spider import Newspider

class DemoFetcher(IntFetcher):
    """Example fetcher that collects post links and pagination links.

    Implements the ``IntFetcher`` methods used by this example:
    ``start_page``, ``fetch_detail_urls`` and ``next_pages``.
    NOTE(review): the exact contract of ``IntFetcher`` is defined in
    ``newspider.interfaces`` and is not visible here -- confirm there.
    """

    def __init__(self):
        # Pagination URLs found so far; fetch_detail_urls appends to this
        # list and next_pages returns it.
        self.next_page = []

    def fetch_detail_urls(self, html):
        """Extract detail-page URLs from a listing page.

        Every ``.post-title a`` link becomes one ``(url, extras)`` tuple,
        where ``extras`` is a dict with a single ``"category"`` entry.
        As a side effect, the ``href`` of every ``.page-navigator a`` link
        is appended to ``self.next_page``.

        :param html: markup of the listing page, handed to PyQuery.
        :return: list of ``(url, extras)`` tuples.
        """
        d = pq(html)
        detail_urls = []

        for a in d('.post-title a'):
            url = d(a).attr('href')
            extras = {"category": "Test for %s" % url}
            detail_urls.append((url, extras))

        # Remember pagination links so next_pages() can report them later.
        for link in d('.page-navigator a'):
            self.next_page.append(d(link).attr('href'))

        return detail_urls

    def start_page(self):
        """Return the list of URLs this example starts from."""
        return [
            'http://www.typechodev.com/',
            'http://www.typechodev.com/index.php/category/questions/',
        ]

    def next_pages(self):
        """Return the pagination URLs collected by fetch_detail_urls."""
        return self.next_page
class DemoParser(IntParser):
    """Example parser that only reports which page it received."""

    def parse(self, tag, html, extras):
        """Print the URL, category and tag of a received page.

        :param tag: value printed after the ``|`` separator.
        :param html: page content; not used by this example.
        :param extras: dict read for the ``'_url'`` and ``'category'``
            keys. NOTE(review): ``'_url'`` is presumably filled in by the
            framework -- confirm against newspider.
        """
        # Parenthesized single-argument form prints the same text on
        # Python 2 and Python 3.
        print("Receive content from url %s for tag %s|%s" % (
            extras.get('_url'), extras.get('category'), tag))
if __name__ == '__main__':
    # Wire the example fetcher and parser into a spider and run it.
    sp = Newspider()
    # NOTE(review): GUARD_INTERVAL is presumably the pause between
    # requests; 0 would disable it -- confirm against newspider.
    sp.config('GUARD_INTERVAL', 0)

    sp.add_parser(DemoParser())
    sp.add_fetcher(DemoFetcher())

    sp.run()

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Files for newspider, version 0.9.9
Filename, size File type Python version Upload date Hashes
Filename, size newspider-0.9.9.tar.gz (8.0 kB) File type Source Python version None Upload date Hashes View

Supported by

Pingdom Pingdom Monitoring Google Google Object Storage and Download Analytics Sentry Sentry Error logging AWS AWS Cloud computing DataDog DataDog Monitoring Fastly Fastly CDN DigiCert DigiCert EV certificate StatusPage StatusPage Status page