Skip to main content

一个高级爬虫框架

Project description

ms_spider爬虫框架的使用介绍

作者:Musen

一、配置文档

1、浏览器配置项

# Browser type
BROWSER: str = 'chrome'  
# Defaults to False. Set to True to reuse the local browser's logged-in state;
# then also configure the browser path and the user-data (cache) directory.
IS_LOCAL_BROWSER: bool = False
# Path to the browser executable
BROWSER_PATH: str = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
    
# Path to the browser's user-data (cache) directory
USER_DIR: str = r'C:\Users\zengyanzhi\AppData\Local\Google\Chrome\User Data'
    
# Whether to enable debug mode (attach to the locally running Chrome browser)
DEBUG = False
# Remote debugging port for the browser
PORT = 19789




# Whether to load images (disabling this speeds up data scraping)
IS_LOAD_IMAGE = True
# Whether to run the browser headless
IS_HEADLESS: bool = True
# Pause between page turns (throttles the crawl rate to avoid anti-bot measures).
# NOTE(review): `random` is not imported in this snippet, and the value is drawn
# once at import time rather than per page turn — confirm that is intended.
TIME_INTERVAL: int = random.randint(1, 3)
    
    

2、通用爬虫配置项

# Initial URL the crawler starts from
start_url: str = ''
# Locator expression for the page's data list (CSS and XPath both supported)
data_list_loc: str = ''

3、自动翻页爬虫

# Auto-pagination: locator for the "next page" button (CSS and XPath both supported)
next_page_btn_loc: str = ""
# Distance of the "next page" button from the bottom of the page
next_button_distance: int = 200
# Delimiter used to split the data (rarely needed)
split_str: str = '\n'
# Number of pages to crawl
pages: int = 1
# Fields to extract: {key: [v1, v2]}
# key is the name the field is saved under, v1 is the attribute to extract,
# and v2 is the locator expression (CSS and XPath both supported)
data_extract_loc = {
        'score': ('text', '//span[@class="real font-bold"]'),
        'name': ('text', '//span[@class="name font-bold"]'),
        'price': ('text', '//span[@class="real-price font-bold"]'),
    }
  • 案例:大众点评

    class DZDPSpider(BasePageCrawler):
        """Dianping (dianping.com) listing crawler example."""
        # Attach to the locally running Chrome instance
        DEBUG = True
        # Entry page: Changsha restaurant category listing
        start_url = 'https://www.dianping.com/changsha/ch10/g112'
        # One <li> per shop in the result list
        data_list_loc = '//*[@id="shop-all-list"]/ul/li'
        # Pagination control and its distance from the page bottom
        next_page_btn_loc = '//a[text()="下一页"]'
        next_button_distance = 200
        # Crawl the first two listing pages
        pages = 2
        split_str = '\n'
        # Field name -> (attribute to read, locator expression)
        data_extract_loc = {
            'url': ('href', '//div[@class="tit"]/a[1]'),
            'name': ('text', '//div[@class="tit"]/a/h4'),
            'price': ('text', '//a[@class="mean-price"]'),
            'recommend': ('text', '//div[@class="recommend"]'),
        }
    

4、滚动点击动态加载爬虫

# 动态加载更多的按钮
loader_more_loc = '//div[@class="list-btn-more"]/div'
# 加载的次数(如果加载所有数据会自动停止)
loaders = 20
  • 案例:

class XCJDSpider(ScrollLoaderSpider):
    """Ctrip (ctrip.com) hotel-listing crawler example."""
    # Attach to the locally running Chrome instance
    DEBUG = True
    # Hotel search results page (Chongqing, preset dates and filters)
    start_url = 'https://hotels.ctrip.com/hotels/list?countryId=1&city=4&checkin=2024/05/01&checkout=2024/05/03&optionId=4&optionType=City&directSearch=0&display=%E9%87%8D%E5%BA%86&crn=1&adult=1&children=0&searchBoxArg=t&travelPurpose=0&ctm_ref=ix_sb_dl&domestic=1&&highPrice=-1&barCurr=CNY&hotPoi=50%7C50%7C4197113&sort=9'
    # One entry per hotel card in the result list
    data_list_loc = '//li[@class="list-item-target"]'
    # "Load more" trigger and the maximum number of loads
    loader_more_loc = '//div[@class="list-btn-more"]/div'
    loaders = 20
    # Field name -> (attribute to read, locator expression)
    data_extract_loc = {
        'score': ('text', '//span[@class="real font-bold"]'),
        'name': ('text', '//span[@class="name font-bold"]'),
        'price': ('text', '//span[@class="real-price font-bold"]'),
    }

5、深度爬虫

# 深度页面打开的间隔时间(需要控制好,不然容易反爬)
open_page_interval: int = 3
# 深度url链接的提取规则
deep_link_url: str = ''
# 深度页面数据提取规则
deep_data_extract_loc: dict = {}
  • 案例:

class DZDPSpider(DeepPageCrawler):
    """Dianping deep-crawl example: walk the result list, then open each shop's detail page."""
    # Attach to the locally running Chrome instance
    DEBUG = True
    # Seconds to wait between opening detail pages (throttles the crawl)
    open_page_interval = 5
    # Search results for the keyword "美容" (beauty) in city 344
    start_url = 'https://www.dianping.com/search/keyword/344/0_%E7%BE%8E%E5%AE%B9'
    # One entry per shop in the result list
    data_list_loc = '//div[@id="shop-all-list"]//ul/li'
    # Pagination: "next page" button and its distance from the page bottom
    next_page_btn_loc = "//a[text()='下一页']"
    next_button_distance = 200
    # Number of listing pages to crawl
    pages = 3
    # Listing-page extraction rules: field name -> (attribute, locator)
    data_extract_loc = {
        'url': ('href', '//div[@class="tit"]/a[1]'),
        'name': ('text', '//div[@class="tit"]/a/h4'),
        'price': ('text', '//a[@class="mean-price"]'),
    }
    # ---------- detail-page (deep) crawl ----------
    # Locator for the links that lead into detail pages
    deep_link_url = '//div[@class="tit"]/a[1]'
    # Detail-page extraction rules
    deep_data_extract_loc = {
        'addr': ('text', '//div[@class="expand-info address"]'),
        "mobile": ('text', '//p[@class="expand-info tel"]')
    }

二、视频图片爬虫案例

1、图片爬虫

from spider.media_spider import ImagesSpider


class MZDPSpider(ImagesSpider):
    """Bulk image downloader for pexels.com.

    Saves images from the start page whose URL begins with
    ``image_start_path`` into ``image_save_path``.
    """
    DEBUG = True
    # Page to open
    start_url = 'https://www.pexels.com/zh-cn/'
    # Only download images whose URL starts with this prefix
    image_start_path = "https://images.pexels.com/photos"
    # Destination directory. Raw string: the original non-raw literal contained
    # invalid escape sequences (\p, \M, \i), a SyntaxWarning on Python 3.12+;
    # the value itself is unchanged.
    image_save_path = r'D:\projectCode\MusenSpider\images'

    def opened(self):
        """Hook called after the page has been opened; intentionally a no-op here."""


if __name__ == '__main__':
    # Run the image crawler when executed as a script
    MZDPSpider().main()

2、抖音视频批量抓取

import random
import time
from spider.media_spider import VideoSpider


class DouYinSpider(VideoSpider):
    """Bulk video crawler for a Douyin creator's account."""
    # Attach to the locally running Chrome instance
    DEBUG = True
    # Homepage URL of the Douyin creator to crawl
    start_url: str = 'https://www.douyin.com/user/MS4wLjABAAAAOlZ8ngnt417GKBbFysKt2Q8ERj84-Wb9xypbB8_hmIc?vid=7369137414838684954'
    # Directory the downloaded videos are saved into
    video_save_path: str = r'D:\projectCode\MusenSpider\video\木瓜电影'
    # URL prefix identifying video resources
    video_start_path: str = 'https://v3-weba.douyinvod.com'
    # MIME types to download
    file_types: list = ['video/mp4']
    # Regex that extracts the file name from the resource URL
    file_name_pattern: str = r'.com/.+?/(.+?)/video'
    # Tag marking audio streams
    audio_tag: str = 'media-audio-und-mp4a'
    # Tag marking video streams
    video_tag: str = 'media-video-hvc1'

    def opened(self):
        """After the page opens: start the first video, then scroll through the rest."""
        video_links = self.page.locator('//ul[@class="e6wsjNLL bGEvyQfj"]//a').all()
        print("up主的视频数量:", len(video_links))
        for index, link in enumerate(video_links):
            # Only the first video needs an explicit click to start playback
            if index == 0:
                link.click()
            # Random pause to mimic a human viewer, then nudge the page forward
            time.sleep(random.randint(3, 8))
            self.page.mouse.wheel(0, 100)


if __name__ == '__main__':
    # Run the Douyin video crawler when executed as a script
    DouYinSpider().main()

3、快手视频批量抓取

import random
import time
from spider.media_spider import VideoSpider


class KuaiShouSpider(VideoSpider):
    """Bulk video crawler for a Kuaishou creator's account."""
    # Attach to the locally running Chrome instance
    DEBUG = True
    # Homepage URL of the Kuaishou creator to crawl
    start_url: str = 'https://www.kuaishou.com/profile/3x3fy6cyami7ai6'
    # Directory the downloaded videos are saved into
    video_save_path: str = r'D:\projectCode\MusenSpider\video\快手'
    # URL prefix identifying video resources.
    # NOTE(review): this is a douyinvod.com prefix inside the Kuaishou example —
    # it looks copy-pasted from the Douyin spider; confirm it is intentional.
    video_start_path: str = 'https://v3-weba.douyinvod.com'
    # MIME types to download
    file_types: list = ['video/mp4']
    # Regex that extracts the file name from the resource URL
    file_name_pattern: str = r'&clientCacheKey=(.+?)&'

    def opened(self):
        """After the page opens: start the first video, then step through the rest."""
        video_cards = self.page.locator('//div[@class="card-link"]').all()
        print("up主的视频数量:", len(video_cards))
        for i, card in enumerate(video_cards):
            print(f"下载第{i + 1}个视频")
            # Only the first video needs an explicit click to start playback
            if i == 0:
                card.click()
            # Random pause to mimic a human viewer
            time.sleep(random.randint(3, 8))
            # Advance to the next video
            self.page.click('//div[@class="switch-item video-switch-next"]')


if __name__ == '__main__':
    # Run the Kuaishou video crawler when executed as a script
    KuaiShouSpider().main()

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

spider_ms-1.0.2.tar.gz (12.5 kB view details)

Uploaded Source

Built Distribution

spider_ms-1.0.2-py3-none-any.whl (12.2 kB view details)

Uploaded Python 3

File details

Details for the file spider_ms-1.0.2.tar.gz.

File metadata

  • Download URL: spider_ms-1.0.2.tar.gz
  • Upload date:
  • Size: 12.5 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/5.1.0 CPython/3.10.14

File hashes

Hashes for spider_ms-1.0.2.tar.gz
Algorithm Hash digest
SHA256 4a38dd6797c15d18e265464b59dbcce20dc23150e527718ee2ea3fffc1556c51
MD5 376bd9c13c414ab3e72307a7d164fa54
BLAKE2b-256 2d3459f93b57639bcb81c693c792633e0956457d26542c8ab298fd1e4072ea13

See more details on using hashes here.

File details

Details for the file spider_ms-1.0.2-py3-none-any.whl.

File metadata

  • Download URL: spider_ms-1.0.2-py3-none-any.whl
  • Upload date:
  • Size: 12.2 kB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/5.1.0 CPython/3.10.14

File hashes

Hashes for spider_ms-1.0.2-py3-none-any.whl
Algorithm Hash digest
SHA256 f46f236e76862670f8acb014d9f577033fc2974dd751baaa08465f6203a00410
MD5 8237e4aec7826e218ec5b092722065de
BLAKE2b-256 f273064ac7b3d4e3cfd56a72d635a472354272871ef952a6b5c8445aa727f2c2

See more details on using hashes here.

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page