
WaterCrawl Python Client


A Python client library for interacting with the WaterCrawl API - a powerful web crawling and scraping service.

Installation

pip install watercrawl-py

Quick Start

from watercrawl import WaterCrawlAPIClient

# Initialize the client
client = WaterCrawlAPIClient('your-api-key')

# Simple URL scraping
result = client.scrape_url('https://example.com')

# Advanced crawling with options
crawl_request = client.create_crawl_request(
    url='https://example.com',
    spider_options={},
    page_options={},
    plugin_options={}
)

# Monitor and download results
for result in client.monitor_crawl_request(crawl_request['uuid']):
    if result['type'] == 'result':
        print(result['data'])  # a result object for each crawled page

API Examples

Client Initialization

from watercrawl import WaterCrawlAPIClient

# Initialize with default base URL
client = WaterCrawlAPIClient('your-api-key')

# Or specify a custom base URL
client = WaterCrawlAPIClient('your-api-key', base_url='https://custom-app.watercrawl.dev/')

Crawling Operations

List all crawl requests

# Get the first page of requests (default page size: 10)
requests = client.get_crawl_requests_list()

# Specify page number and size
requests = client.get_crawl_requests_list(page=2, page_size=20)

Get a specific crawl request

request = client.get_crawl_request('request-uuid')

Create a crawl request

# Simple request with just a URL
request = client.create_crawl_request(url='https://example.com')

# Advanced request with a single URL
request = client.create_crawl_request(
    url='https://example.com',
    spider_options={
        "max_depth": 1, # maximum depth to crawl
        "page_limit": 1, # maximum number of pages to crawl
        "allowed_domains": [], # allowed domains to crawl
        "exclude_paths": [], # exclude paths
        "include_paths": [] # include paths
    },
    page_options={
        "exclude_tags": [], # exclude tags from the page
        "include_tags": [], # include tags from the page
        "wait_time": 1000, # wait time in milliseconds after page load
        "include_html": False, # the result will include HTML
        "only_main_content": True, # only main content of the page automatically remove headers, footers, etc.
        "include_links": False, # if True the result will include links
        "timeout": 15000, # timeout in milliseconds
        "accept_cookies_selector": None, # accept cookies selector e.g. "#accept-cookies"
        "locale": "en-US", # locale
        "extra_headers": {}, # extra headers e.g. {"Authorization": "Bearer your_token"}
        "actions": [] # actions to perform {"type": "screenshot"} or {"type": "pdf"}
    },
    plugin_options={}
)

Create a batch crawl request

# Batch crawl multiple URLs at once
request = client.create_batch_crawl_request(
    urls=['https://example.com', 'https://example.org'],
    spider_options={
        "proxy_server": None,
    },
    page_options={
        "exclude_tags": [],
        "include_tags": [],
        "wait_time": 1000,
        "only_main_content": True,
        "include_html": False,
        "include_links": True,
        "timeout": 15000,
        "accept_cookies_selector": None,
        "locale": None,
        "extra_headers": {},
        "actions": []
    },
    plugin_options={}
)

Stop a crawl request

client.stop_crawl_request('request-uuid')

Download a crawl request result

# Download the crawl request as a ZIP file
zip_data = client.download_crawl_request('request-uuid')

# Save to a file
with open('crawl_results.zip', 'wb') as f:
    f.write(zip_data)
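
Since download_crawl_request returns the archive as raw bytes, you can also inspect it in memory with the standard library's zipfile module. A minimal sketch (the layout of files inside the archive is not specified by this README):

import io
import zipfile

zip_data = client.download_crawl_request('request-uuid')
with zipfile.ZipFile(io.BytesIO(zip_data)) as archive:
    for name in archive.namelist():  # one entry per crawled page (assumed)
        print(name, len(archive.read(name)), 'bytes')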

Monitor a crawl request

# Monitor with automatic result download (default)
for event in client.monitor_crawl_request('request-uuid'):
    if event['type'] == 'state':
        print(f"Crawl state: {event['data']['status']}")
    elif event['type'] == 'result':
        print(f"Received result for: {event['data']['url']}")
    # events with type 'feed' carry engine feedback messages

# Monitor without downloading results; each result is returned as a URL instead of a result object
for event in client.monitor_crawl_request('request-uuid', download=False):
    print(f"Event type: {event['type']}")

Get crawl request results

# Get the first page of results
results = client.get_crawl_request_results('request-uuid')

# Specify page number and size
results = client.get_crawl_request_results('request-uuid', page=2, page_size=20)

# Download results directly (instead of just getting URLs)
results = client.get_crawl_request_results('request-uuid', download=True)
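
If the results endpoint follows a common paginated envelope with 'results' and 'next' keys (an assumption of this sketch, not confirmed by this README), you can walk every page like this:

page = 1
while True:
    batch = client.get_crawl_request_results('request-uuid', page=page, page_size=20)
    for item in batch.get('results', []):  # 'results'/'next' envelope is assumed
        print(item)
    if not batch.get('next'):  # no further pages
        break
    page += 1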

Quick URL scraping

# Synchronous scraping (default)
result = client.scrape_url('https://example.com')

# With page options
result = client.scrape_url(
    'https://example.com',
    page_options={}
)

# Asynchronous scraping
request = client.scrape_url('https://example.com', sync=False)
# Check for results later with get_crawl_request (a polling sketch follows)
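
A minimal polling sketch for the asynchronous case, assuming the request dict exposes 'uuid' and 'status' keys and that a completed crawl reports a terminal status such as 'finished' (both field and status names are assumptions of this example):

import time

request = client.scrape_url('https://example.com', sync=False)
while True:
    current = client.get_crawl_request(request['uuid'])
    if current['status'] in ('finished', 'failed'):  # terminal status names are assumed
        break
    time.sleep(2)  # poll every two seconds

results = client.get_crawl_request_results(request['uuid'], download=True)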

Sitemap Operations

Get sitemap from a crawl request

# Get sitemap in different formats from a crawl request
sitemap_json = client.get_crawl_request_sitemap('request-uuid', output_format='json')
sitemap_graph = client.get_crawl_request_sitemap('request-uuid', output_format='graph')
sitemap_markdown = client.get_crawl_request_sitemap('request-uuid', output_format='markdown')

# Or use a crawl request object
crawl_request = client.get_crawl_request('request-uuid')
sitemap_json = client.get_crawl_request_sitemap(crawl_request, output_format='json')

Create a dedicated sitemap request

sitemap_request = client.create_sitemap_request(
    url='https://example.com',
    options={
        "include_subdomains": True,
        "ignore_sitemap_xml": False,
        "search": None,
        "include_paths": [],
        "exclude_paths": []
    }
)

List sitemap requests

# Get the first page of sitemap requests
requests = client.get_sitemap_requests_list()

# Specify page number and size
requests = client.get_sitemap_requests_list(page=2, page_size=20)

Get a specific sitemap request

sitemap_request = client.get_sitemap_request('sitemap-uuid')

Monitor a sitemap request

# Monitor sitemap generation with automatic result download
for event in client.monitor_sitemap_request('sitemap-uuid'):
    if event['type'] == 'state':
        print(f"Sitemap state: {event['data']['status']}")
    elif event['type'] == 'feed':
        print(f"Feed: {event['data']['message']}")

Get sitemap results

# Get sitemap in different formats
sitemap_json = client.get_sitemap_results('sitemap-uuid', output_format='json')
sitemap_graph = client.get_sitemap_results('sitemap-uuid', output_format='graph')
sitemap_markdown = client.get_sitemap_results('sitemap-uuid', output_format='markdown')

# You can also use the sitemap request object
sitemap_request = client.get_sitemap_request('sitemap-uuid')
sitemap_json = client.get_sitemap_results(sitemap_request, output_format='json')
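
Assuming the markdown output is returned as a plain string (this README does not state the return type), persisting it to disk is straightforward:

sitemap_markdown = client.get_sitemap_results('sitemap-uuid', output_format='markdown')
with open('sitemap.md', 'w', encoding='utf-8') as f:
    f.write(sitemap_markdown)  # assumes a str; decode first if your version returns bytes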

Stop a sitemap request

client.stop_sitemap_request('sitemap-uuid')

Deprecated sitemap functions

The following functions are deprecated and will be removed in a future version:

# Use get_crawl_request_sitemap instead
sitemap = client.download_sitemap('request-uuid')
graph_data = client.download_sitemap_graph('request-uuid')
markdown = client.download_sitemap_markdown('request-uuid')

Search Operations

Create a search request

# Simple search
search = client.create_search_request(query="python programming")

# Search with options and limited results
search = client.create_search_request(
    query="python tutorial", 
    search_options={
        "language": null, # language code e.g. "en" or "fr" or "es"
        "country": null, # country code e.g. "us" or "fr" or "es"
        "time_renge": "any", # time range e.g. "any" or "hour" or "day" or "week" or "month" or "year"
        "search_type": "web", # search type e.g. "web" now just web is supported
        "depth": "basic" # depth e.g. "basic" or "advanced" or "ultimate"
    },
    result_limit=5, # limit the number of results
    sync=True, # wait for results
    download=True # download results
)

# Asynchronous search
search = client.create_search_request(
    query="machine learning",
    search_options={},
    result_limit=5, # limit the number of results
    sync=False, # Don't wait for results
    download=False # Don't download results
)
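
An asynchronous search can be fetched again later with get_search_request. A sketch assuming the returned dict carries 'uuid' and 'status' keys (both field and status names are assumptions here):

import time

search = client.create_search_request(query="machine learning", sync=False, download=False)
while True:
    current = client.get_search_request(search['uuid'], download=True)
    if current.get('status') in ('finished', 'failed'):  # terminal status names are assumed
        break
    time.sleep(2)
print(current.get('result'))  # result field name is an assumption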

Monitor a search request

# Monitor with automatic result download; currently the only event type is 'state'
for event in client.monitor_search_request('search-uuid'):
    if event['type'] == 'state':
        print(f"Search state: {event['data']['status']}")
    
# Monitor without downloading results
for event in client.monitor_search_request('search-uuid', download=False):
    print(f"Event: {event}")

Get search request details

search = client.get_search_request('search-uuid', download=True)

Stop a search request

client.stop_search_request('search-uuid')

Features

  • Simple and intuitive API client
  • Support for both synchronous and asynchronous crawling
  • Comprehensive crawling options and configurations
  • Built-in request monitoring and result downloading
  • Efficient session management and request handling
  • Support for sitemaps and search operations

Documentation

For detailed documentation and examples, visit the WaterCrawl Documentation.

Requirements

  • Python >= 3.7
  • requests library

Compatibility

  • WaterCrawl API >= 0.9.2

License

This project is licensed under the MIT License - see the LICENSE file for details.

Support

For support, please open an issue on the watercrawl/watercrawl-py GitHub repository.

Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

