This project is used for calling the WaterCrawl API from Python
Project description
WaterCrawl Python Client
A Python client library for interacting with the WaterCrawl API - a powerful web crawling and scraping service.
Installation
pip install watercrawl-py
Quick Start
from watercrawl import WaterCrawlAPIClient
# Initialize the client
client = WaterCrawlAPIClient('your-api-key')
# Simple URL scraping
result = client.scrape_url('https://example.com')
# Advanced crawling with options
crawl_request = client.create_crawl_request(
url='https://example.com',
spider_options={},
page_options={},
plugin_options={}
)
# Monitor and download results
for result in client.monitor_crawl_request(crawl_request['uuid']):
if result['type'] == 'result':
print(result['data']) # it is a result object per page
API Examples
Client Initialization
from watercrawl import WaterCrawlAPIClient
# Initialize with default base URL
client = WaterCrawlAPIClient('your-api-key')
# Or specify a custom base URL
client = WaterCrawlAPIClient('your-api-key', base_url='https://custom-app.watercrawl.dev/')
Crawling Operations
List all crawl requests
# Get the first page of requests (default page size: 10)
requests = client.get_crawl_requests_list()
# Specify page number and size
requests = client.get_crawl_requests_list(page=2, page_size=20)
Get a specific crawl request
request = client.get_crawl_request('request-uuid')
Create a crawl request
# Simple request with just a URL
request = client.create_crawl_request(url='https://example.com')
# Advanced request with a single URL
request = client.create_crawl_request(
url='https://example.com',
spider_options={
"max_depth": 1, # maximum depth to crawl
"page_limit": 1, # maximum number of pages to crawl
"allowed_domains": [], # allowed domains to crawl
"exclude_paths": [], # exclude paths
"include_paths": [] # include paths
},
page_options={
"exclude_tags": [], # exclude tags from the page
"include_tags": [], # include tags from the page
"wait_time": 1000, # wait time in milliseconds after page load
"include_html": False, # the result will include HTML
"only_main_content": True, # keep only the main content of the page; automatically removes headers, footers, etc.
"include_links": False, # if True the result will include links
"timeout": 15000, # timeout in milliseconds
"accept_cookies_selector": None, # accept cookies selector e.g. "#accept-cookies"
"locale": "en-US", # locale
"extra_headers": {}, # extra headers e.g. {"Authorization": "Bearer your_token"}
"actions": [] # actions to perform {"type": "screenshot"} or {"type": "pdf"}
},
plugin_options={}
)
Create a batch crawl request
# Batch crawl multiple URLs at once
request = client.create_batch_crawl_request(
urls=['https://example.com', 'https://example.org'],
spider_options={
"proxy_server": None,
},
page_options={
"exclude_tags": [],
"include_tags": [],
"wait_time": 1000,
"only_main_content": True,
"include_html": False,
"include_links": True,
"timeout": 15000,
"accept_cookies_selector": None,
"locale": None,
"extra_headers": {},
"actions": []
},
plugin_options={}
)
Stop a crawl request
client.stop_crawl_request('request-uuid')
Download a crawl request result
# Download the crawl request as a ZIP file
zip_data = client.download_crawl_request('request-uuid')
# Save to a file
with open('crawl_results.zip', 'wb') as f:
f.write(zip_data)
Monitor a crawl request
# Monitor with automatic result download (default)
for event in client.monitor_crawl_request('request-uuid'):
if event['type'] == 'state':
print(f"Crawl state: {event['data']['status']}")
elif event['type'] == 'result':
print(f"Received result for: {event['data']['url']}")
# you will also receive engine feedback events here, with type 'feed'
# Monitoring without downloading returns each result as a URL instead of a result object
for event in client.monitor_crawl_request('request-uuid', download=False):
print(f"Event type: {event['type']}")
Get crawl request results
# Get the first page of results
results = client.get_crawl_request_results('request-uuid')
# Specify page number and size
results = client.get_crawl_request_results('request-uuid', page=2, page_size=20)
# Download results directly (instead of just getting URLs)
results = client.get_crawl_request_results('request-uuid', download=True)
Quick URL scraping
# Synchronous scraping (default)
result = client.scrape_url('https://example.com')
# With page options
result = client.scrape_url(
'https://example.com',
page_options={}
)
# Asynchronous scraping
request = client.scrape_url('https://example.com', sync=False)
# Later check for results with get_crawl_request
Sitemap Operations
Get sitemap from a crawl request
# Get sitemap in different formats from a crawl request
sitemap_json = client.get_crawl_request_sitemap('request-uuid', output_format='json')
sitemap_graph = client.get_crawl_request_sitemap('request-uuid', output_format='graph')
sitemap_markdown = client.get_crawl_request_sitemap('request-uuid', output_format='markdown')
# Or use a crawl request object
crawl_request = client.get_crawl_request('request-uuid')
sitemap_json = client.get_crawl_request_sitemap(crawl_request, output_format='json')
Create a dedicated sitemap request
sitemap_request = client.create_sitemap_request(
url='https://example.com',
options={
"include_subdomains": True,
"ignore_sitemap_xml": False,
"search": None,
"include_paths": [],
"exclude_paths": []
}
)
List sitemap requests
# Get the first page of sitemap requests
requests = client.get_sitemap_requests_list()
# Specify page number and size
requests = client.get_sitemap_requests_list(page=2, page_size=20)
Get a specific sitemap request
sitemap_request = client.get_sitemap_request('sitemap-uuid')
Monitor a sitemap request
# Monitor sitemap generation with automatic result download
for event in client.monitor_sitemap_request('sitemap-uuid'):
if event['type'] == 'state':
print(f"Sitemap state: {event['data']['status']}")
elif event['type'] == 'feed':
print(f"Feed: {event['data']['message']}")
Get sitemap results
# Get sitemap in different formats
sitemap_json = client.get_sitemap_results('sitemap-uuid', output_format='json')
sitemap_graph = client.get_sitemap_results('sitemap-uuid', output_format='graph')
sitemap_markdown = client.get_sitemap_results('sitemap-uuid', output_format='markdown')
# You can also use the sitemap request object
sitemap_request = client.get_sitemap_request('sitemap-uuid')
sitemap_json = client.get_sitemap_results(sitemap_request, output_format='json')
Stop a sitemap request
client.stop_sitemap_request('sitemap-uuid')
Deprecated sitemap functions
The following functions are deprecated and will be removed in a future version:
# Use get_crawl_request_sitemap instead
sitemap = client.download_sitemap('request-uuid')
graph_data = client.download_sitemap_graph('request-uuid')
markdown = client.download_sitemap_markdown('request-uuid')
Search Operations
Create a search request
# Simple search
search = client.create_search_request(query="python programming")
# Search with options and limited results
search = client.create_search_request(
query="python tutorial",
search_options={
"language": None, # language code e.g. "en" or "fr" or "es"
"country": None, # country code e.g. "us" or "fr" or "es"
"time_range": "any", # time range e.g. "any" or "hour" or "day" or "week" or "month" or "year"
"search_type": "web", # search type e.g. "web" now just web is supported
"depth": "basic" # depth e.g. "basic" or "advanced" or "ultimate"
},
result_limit=5, # limit the number of results
sync=True, # wait for results
download=True # download results
)
# Asynchronous search
search = client.create_search_request(
query="machine learning",
search_options={},
result_limit=5, # limit the number of results
sync=False, # Don't wait for results
download=False # Don't download results
)
Monitor a search request
# Monitor with automatic result download; for now the only event type is 'state'
for event in client.monitor_search_request('search-uuid'):
if event['type'] == 'state':
print(f"Search state: {event['status']}")
# Monitor without downloading results
for event in client.monitor_search_request('search-uuid', download=False):
print(f"Event: {event}")
Get search request details
search = client.get_search_request('search-uuid', download=True)
Stop a search request
client.stop_search_request('search-uuid')
Features
- Simple and intuitive API client
- Support for both synchronous and asynchronous crawling
- Comprehensive crawling options and configurations
- Built-in request monitoring and result downloading
- Efficient session management and request handling
- Support for sitemaps and search operations
Documentation
For detailed documentation and examples, visit WaterCrawl Documentation.
Requirements
- Python >= 3.7
- requests library
Compatibility
- WaterCrawl API >= 0.9.2
License
This project is licensed under the MIT License - see the LICENSE file for details.
Support
For support, please visit:
- Issues: GitHub Issues
- Homepage: GitHub Repository
- Documentation: WaterCrawl Docs
Contributing
Contributions are welcome! Please feel free to submit a Pull Request.
Project details
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Filter files by name, interpreter, ABI, and platform.
If you're not sure about the file name format, learn more about wheel file names.
Copy a direct link to the current filters
File details
Details for the file watercrawl_py-0.9.2.tar.gz.
File metadata
- Download URL: watercrawl_py-0.9.2.tar.gz
- Upload date:
- Size: 14.3 kB
- Tags: Source
- Uploaded using Trusted Publishing? Yes
- Uploaded via: twine/6.1.0 CPython/3.12.9
File hashes
| Algorithm | Hash digest | |
|---|---|---|
| SHA256 |
1871a2545c42294672f102353a7b68c4309ce605cfa23797d624f306ddc5d2f5
|
|
| MD5 |
be13d8aeb97b4fccbad26e4b69162538
|
|
| BLAKE2b-256 |
a99fa029738b06ba6201453878fe0ff2028dcbeed2b6d537f2ac54ee1459b63d
|
Provenance
The following attestation bundles were made for watercrawl_py-0.9.2.tar.gz:
Publisher:
publish.yml on watercrawl/watercrawl-py
-
Statement:
-
Statement type:
https://in-toto.io/Statement/v1 -
Predicate type:
https://docs.pypi.org/attestations/publish/v1 -
Subject name:
watercrawl_py-0.9.2.tar.gz -
Subject digest:
1871a2545c42294672f102353a7b68c4309ce605cfa23797d624f306ddc5d2f5 - Sigstore transparency entry: 255714418
- Sigstore integration time:
-
Permalink:
watercrawl/watercrawl-py@113c7453ecebeee93d8b23552b6ef85c233d5968 -
Branch / Tag:
refs/tags/v0.9.2 - Owner: https://github.com/watercrawl
-
Access:
public
-
Token Issuer:
https://token.actions.githubusercontent.com -
Runner Environment:
github-hosted -
Publication workflow:
publish.yml@113c7453ecebeee93d8b23552b6ef85c233d5968 -
Trigger Event:
push
-
Statement type:
File details
Details for the file watercrawl_py-0.9.2-py3-none-any.whl.
File metadata
- Download URL: watercrawl_py-0.9.2-py3-none-any.whl
- Upload date:
- Size: 12.5 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? Yes
- Uploaded via: twine/6.1.0 CPython/3.12.9
File hashes
| Algorithm | Hash digest | |
|---|---|---|
| SHA256 |
7391dd9be6236083ed6e8f88fa90de877ea6651a090a90301e435dc1a033db79
|
|
| MD5 |
8929027158876451ea377940659e6aca
|
|
| BLAKE2b-256 |
ce8110655f929ba5c10601605cd3a3134592a95a8a671f98f83c28cfcf6ab798
|
Provenance
The following attestation bundles were made for watercrawl_py-0.9.2-py3-none-any.whl:
Publisher:
publish.yml on watercrawl/watercrawl-py
-
Statement:
-
Statement type:
https://in-toto.io/Statement/v1 -
Predicate type:
https://docs.pypi.org/attestations/publish/v1 -
Subject name:
watercrawl_py-0.9.2-py3-none-any.whl -
Subject digest:
7391dd9be6236083ed6e8f88fa90de877ea6651a090a90301e435dc1a033db79 - Sigstore transparency entry: 255714419
- Sigstore integration time:
-
Permalink:
watercrawl/watercrawl-py@113c7453ecebeee93d8b23552b6ef85c233d5968 -
Branch / Tag:
refs/tags/v0.9.2 - Owner: https://github.com/watercrawl
-
Access:
public
-
Token Issuer:
https://token.actions.githubusercontent.com -
Runner Environment:
github-hosted -
Publication workflow:
publish.yml@113c7453ecebeee93d8b23552b6ef85c233d5968 -
Trigger Event:
push
-
Statement type: