Skip to main content

分类扒数据的简易框架

Project description

## Example.py

# -- coding: utf-8 --
from pyquery import PyQuery as pq

from newspider.interfaces import *
from newspider.spider import Newspider

class DemoFetcher(IntFetcher):
    """Example fetcher that collects post links and pagination links.

    The seed pages are two URLs on www.typechodev.com. Every parsed page
    contributes its post links as detail URLs and its pagination links to
    the list returned by ``next_pages``.
    """

    def __init__(self):
        # Pagination URLs discovered so far; filled by fetch_detail_urls().
        self.next_page = []

    def fetch_detail_urls(self, html):
        """Extract detail-page URLs from one listing page.

        Args:
            html: Markup of a listing page, handed to PyQuery for parsing.

        Returns:
            A list of ``(url, extras)`` tuples, one per element matching
            ``.post-title a``. ``extras`` is a dict with a single
            ``"category"`` key whose value embeds the URL.

        Side effects:
            Appends the ``href`` of every ``.page-navigator a`` element to
            ``self.next_page``. No de-duplication is performed, so a link
            seen on several pages is appended several times.
        """
        d = pq(html)
        detail_urls = []

        for anchor in d('.post-title a'):
            url = d(anchor).attr('href')
            extras = {"category": "Test for %s" % url}
            detail_urls.append((url, extras))

        for link in d('.page-navigator a'):
            self.next_page.append(d(link).attr('href'))

        return detail_urls

    def start_page(self):
        """Return the list of seed URLs."""
        return [
            'http://www.typechodev.com/',
            'http://www.typechodev.com/index.php/category/questions/',
        ]

    def next_pages(self):
        """Return the pagination URLs accumulated so far.

        The internal list itself is returned, not a copy.
        """
        return self.next_page

class DemoParser(IntParser):
    """Example parser that only reports which page it received."""

    def parse(self, tag, html, extras):
        """Print a one-line summary of the received page.

        Args:
            tag: Value interpolated last in the printed message.
            html: Page content; unused by this demo parser.
            extras: Dict read for the ``'_url'`` and ``'category'`` keys.
                Missing keys are printed as ``None`` because ``dict.get``
                is used. NOTE(review): ``'_url'`` is presumably injected by
                the framework, since the demo fetcher only sets
                ``'category'`` -- confirm.
        """
        # Single-argument call form prints the same text on Python 2 and 3.
        print("Receive content from url %s for tag %s|%s"
              % (extras.get('_url'), extras.get('category'), tag))

if __name__ == '__main__':
    # Wire the demo parser and fetcher into a spider and start it.
    sp = Newspider()
    # NOTE(review): GUARD_INTERVAL is presumably the delay between
    # requests; 0 would disable it -- confirm against newspider's docs.
    sp.config('GUARD_INTERVAL', 0)

    sp.add_parser(DemoParser())
    sp.add_fetcher(DemoFetcher())

    sp.run()

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

newspider-0.9.8.tar.gz (7.9 kB view hashes)

Uploaded Source

Supported by

AWS (Cloud computing and Security Sponsor), Datadog (Monitoring), Fastly (CDN), Google (Download Analytics), Microsoft (PSF Sponsor), Pingdom (Monitoring), Sentry (Error logging), StatusPage (Status page)