bluemoss enables you to easily scrape websites.
Project description
bluemoss
a simple way to scrape the web
Example HTML
<html>
  <head>
    <title>Portfolio</title>
  </head>
  <body>
    <li>
      <div>
        <a href="/portfolio?company=apple">
          Apple
        </a>
        <div class="location_us">
          <p>Cupertino, California</p>
        </div>
      </div>
    </li>
    <li>
      <div>
        <a href="/portfolio?company=google">
          Google
        </a>
        <div class="location_us">
          <p>Mountain View, California</p>
        </div>
      </div>
    </li>
    <li>
      <div>
        <a href="/portfolio?company=tesla">
          Tesla
        </a>
        <div class="location_us">
          <p>Austin, Texas</p>
        </div>
      </div>
    </li>
    <li>
      <div>
        <a href="/portfolio?company=deepmind">
          DeepMind
        </a>
        <div class="location_uk">
          <p>London, United Kingdom</p>
        </div>
      </div>
    </li>
  </body>
</html>
scraping example 1
from src.bluemoss import Node, scrape
from ..constants import README_EXAMPLE_HTML as HTML

# Every path variant below resolves to the first matched <a> tag in the
# document, i.e. the "Apple" link.
equivalent_queries: list[Node] = [
    Node('a'),
    Node('div/a'),
    Node('div/a'),
    Node('body//a'),
    Node('body//div/a'),
    Node('a', filter=0),
]
for query in equivalent_queries:
    assert scrape(query, HTML) == 'Apple'
scraping example 2
from src.bluemoss import Node, scrape
from ..constants import README_EXAMPLE_HTML as HTML

# Each query resolves to the first matched <p> tag (or its wrapping div),
# whose text is the headquarters of the first company in the document.
equivalent_queries: list[Node] = [
    Node('p'),
    Node('li//p'),
    Node('div/p'),
    Node('div//p'),
    Node('div[contains(@class, "location_")]'),
    Node('body//div[contains(@class, "location_")]'),
]
for query in equivalent_queries:
    assert scrape(query, HTML) == 'Cupertino, California'
scraping example 3
from src.bluemoss import Node, Range, scrape
from ..constants import README_EXAMPLE_HTML as HTML

# The `filter` argument selects which of the matched <a> tags to keep:
# an int keeps exactly one element, a list of indices or a Range keeps a
# subset (returned as a list), and None keeps every match.
cases = [
    (3, 'DeepMind'),
    ([0, 2], ['Apple', 'Tesla']),
    (Range(2), ['Tesla', 'DeepMind']),
    (Range(2, 4), ['Tesla', 'DeepMind']),
    (Range(2, 3), ['Tesla']),
    (None, ['Apple', 'Google', 'Tesla', 'DeepMind']),
]
for node_filter, expected in cases:
    assert scrape(Node('a', filter=node_filter), HTML) == expected
scraping example 4
from src.bluemoss import Node, Range, Ex, scrape
from ..constants import README_EXAMPLE_HTML as HTML


def get_company_id(hrefs: list[str]) -> list[str]:
    """Reduce each href of the form '...company=<id>' to its trailing <id>."""
    return [link.rsplit('=', 1)[-1] for link in hrefs]


# Three equivalent ways of extracting the href attribute of every matched
# <a> tag (skipping the first match via Range(1)) before transforming it.
for query in (
    Node('a', filter=Range(1), extract=Ex.HREF, transform=get_company_id),
    Node('a', filter=Range(1), extract='href', transform=get_company_id),
    Node('a/@href', filter=Range(1), transform=get_company_id),
):
    assert scrape(query, HTML) == ['google', 'tesla', 'deepmind']
scraping example 5
from src.bluemoss import Node, scrape
from ..constants import README_EXAMPLE_HTML as HTML

# A Node with child nodes scrapes every matched <li> into a dict whose
# keys are the children's `key` values.
node = Node(
    'li',
    filter=None,
    nodes=[
        Node('a', key='name'),
        Node('p', key='headquarters'),
        Node('a/@href', key='id', transform=lambda href: href.split('=')[1]),
    ],
)
expected = [
    {'id': 'apple', 'name': 'Apple', 'headquarters': 'Cupertino, California'},
    {'id': 'google', 'name': 'Google', 'headquarters': 'Mountain View, California'},
    {'id': 'tesla', 'name': 'Tesla', 'headquarters': 'Austin, Texas'},
    {'id': 'deepmind', 'name': 'DeepMind', 'headquarters': 'London, United Kingdom'},
]
assert scrape(node, HTML) == expected
scraping example 6
from src.bluemoss import Node, scrape
from ..constants import README_EXAMPLE_HTML as HTML

# Same as the previous example, but giving the outer Node a `key` wraps
# the scraped list in a dict under that key instead of returning the list.
node = Node(
    'li',
    filter=None,
    key='companies',
    nodes=[
        Node('a', key='name'),
        Node('p', key='headquarters'),
        Node('a/@href', key='id', transform=lambda href: href.split('=')[1]),
    ],
)
expected_companies = [
    {'id': 'apple', 'name': 'Apple', 'headquarters': 'Cupertino, California'},
    {'id': 'google', 'name': 'Google', 'headquarters': 'Mountain View, California'},
    {'id': 'tesla', 'name': 'Tesla', 'headquarters': 'Austin, Texas'},
    {
        'id': 'deepmind',
        'name': 'DeepMind',
        'headquarters': 'London, United Kingdom',
    },
]
assert scrape(node, HTML) == {'companies': expected_companies}
scraping example 7
from __future__ import annotations
from dataclasses import dataclass
from src.bluemoss import Node, scrape, Jsonify
from ..constants import README_EXAMPLE_HTML as HTML


@dataclass
class Companies(Jsonify):
    """Scrape result: all companies plus per-country counts.

    Jsonify provides the .dict and .json serializations asserted below.
    """

    companies: list[Company]
    amount_uk_companies: int
    amount_us_companies: int


@dataclass
class Company(Jsonify):
    """One portfolio entry scraped from a single <li> tag."""

    id: str
    name: str
    location: str


node = Node(
    # `target` instantiates this dataclass from the child nodes' keys.
    target=Companies,
    nodes=[
        Node(
            "count(//div[@class='location_uk'])",
            key='amount_uk_companies',
            # XPath count() yields a number; coerce to int (None if falsy).
            transform=lambda count: int(count) if count else None,
        ),
        Node(
            "count(//div[@class='location_us'])",
            key='amount_us_companies',
            transform=lambda count: int(count) if count else None,
        ),
        Node(
            'li',
            filter=None,  # keep every matched <li>, not just the first
            key='companies',
            target=Company,  # each <li> becomes one Company instance
            nodes=[
                Node('a', key='name'),
                Node('p', key='location'),
                Node(
                    'a',
                    key='id',
                    extract='href',
                    # '/portfolio?company=<id>' -> '<id>'
                    transform=lambda href: href.split('=')[1],
                ),
            ],
        ),
    ],
)

companies: Companies = scrape(node, HTML)
assert isinstance(companies, Companies)

# .dict serializes the nested dataclasses into plain dicts/lists.
assert companies.dict == {
    'companies': [
        {'id': 'apple', 'name': 'Apple', 'location': 'Cupertino, California'},
        {'id': 'google', 'name': 'Google', 'location': 'Mountain View, California'},
        {'id': 'tesla', 'name': 'Tesla', 'location': 'Austin, Texas'},
        {'id': 'deepmind', 'name': 'DeepMind', 'location': 'London, United Kingdom'},
    ],
    'amount_uk_companies': 1,
    'amount_us_companies': 3,
}

# .json renders the same data as a JSON string (compared verbatim).
assert (
    companies.json
    == """{
"companies": [
{
"id": "apple",
"name": "Apple",
"location": "Cupertino, California"
},
{
"id": "google",
"name": "Google",
"location": "Mountain View, California"
},
{
"id": "tesla",
"name": "Tesla",
"location": "Austin, Texas"
},
{
"id": "deepmind",
"name": "DeepMind",
"location": "London, United Kingdom"
}
],
"amount_uk_companies": 1,
"amount_us_companies": 3
}"""
)
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
bluemoss-0.1.31.tar.gz
(27.0 kB
view hashes)
Built Distribution
bluemoss-0.1.31-py3-none-any.whl
(21.1 kB
view hashes)
Close
Hashes for bluemoss-0.1.31-py3-none-any.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | a902a3bf89cd68e380993bcc5b893dd4468a4864cb3c06201c0630e4b3832b68 |
|
MD5 | 4576c0fd2636095bb3a21031c17de459 |
|
BLAKE2b-256 | c3df2040ea702d2156038a76938ca9697497cee9fe9f44af6544e05d2ce4df54 |