bluemoss enables you to easily scrape websites.
Project description
bluemoss
a simple way to scrape the web
<html>
<head>
<title>Portfolio</title>
</head>
<body>
<li>
<div>
<a href="/portfolio?company=apple">
Apple
</a>
<div class="location_us">
<p>Cupertino, California</p>
</div>
</div>
</li>
<li>
<div>
<a href="/portfolio?company=google">
Google
</a>
<div class="location_us">
<p>Mountain View, California</p>
</div>
</div>
</li>
<li>
<div>
<a href="/portfolio?company=tesla">
Tesla
</a>
<div class="location_us">
<p>Austin, Texas</p>
</div>
</div>
</li>
<li>
<div>
<a href="/portfolio?company=deepmind">
DeepMind
</a>
<div class="location_uk">
<p>London, United Kingdom</p>
</div>
</div>
</li>
</body>
</html>
Example 1
```python
from bluemoss import Node, scrape

node = Node('a')            # scrape(node, HTML) == 'Apple'
node = Node('div/a')        # scrape(node, HTML) == 'Apple'
node = Node('body//a')      # scrape(node, HTML) == 'Apple'
node = Node('body//div/a')  # scrape(node, HTML) == 'Apple'
node = Node('a', filter=0)  # scrape(node, HTML) == 'Apple'
```
Example 2
```python
from bluemoss import Node, scrape
node = Node('p') # 'Cupertino, California'
node = Node('li//p') # 'Cupertino, California'
node = Node('div/p') # 'Cupertino, California'
node = Node('div//p') # 'Cupertino, California'
node = Node('p[contains(@class, "location_")]') # 'Cupertino, California'
node = Node('div//p[contains(@class, "location_")]') # 'Cupertino, California'
```
Example 3
```python
from bluemoss import Node, Range, scrape

node = Node('a', filter=3)            # 'DeepMind'
node = Node('a', filter=[0, 2])       # ['Apple', 'Tesla']
node = Node('a', filter=Range(2))     # ['Tesla', 'DeepMind']
node = Node('a', filter=Range(2, 4))  # ['Tesla', 'DeepMind']
node = Node('a', filter=Range(2, 3))  # ['Tesla']
node = Node('a', filter=None)         # ['Apple', 'Google', 'Tesla', 'DeepMind']
```
Example 4
```python
from bluemoss import Node, Range, Ex, scrape
node_1 = Node(
'a',
filter=Range(1),
extract=Ex.HREF,
transform=lambda href: href.split('=')[1]
)
# scrape(node_1, HTML) == ['google', 'tesla', 'deepmind']
node_2 = Node(
'a',
filter=Range(1),
extract="href",
transform=lambda href: href.split('=')[1]
)
# scrape(node_2, HTML) == ['google', 'tesla', 'deepmind']
node_3 = Node(
'a/@href',
filter=Range(1),
transform=lambda href: href.split('=')[1]
)
# scrape(node_3, HTML) == ['google', 'tesla', 'deepmind']
```
Example 5
from bluemoss import Node, scrape
node = Node(
'li',
filter=None,
nodes=[
Node('a', key='name'),
Node('p', key='headquarters'),
Node(
'a/@href',
key='id',
transform=lambda href: href.split("=")[1]
)
]
)
print(scrape(node, HTML))
Output
[
{
'id': 'apple',
'name': 'Apple',
'headquarters': 'Cupertino, California'
},
{
'id': 'google',
'name': 'Google',
'headquarters': 'Mountain View, California'
},
{
'id': 'tesla',
'name': 'Tesla',
'headquarters': 'Austin, Texas'
},
{
'id': 'deepmind',
'name': 'DeepMind',
'headquarters': 'London, United Kingdom'
}
]
Example 6
from bluemoss import Node, scrape
node = Node(
'li',
filter=None,
key='companies',
nodes=[
Node('a', key='name'),
Node('p', key='headquarters'),
Node(
'a/@href',
key='id',
transform=lambda href: href.split("=")[1]
)
]
)
print(scrape(node, HTML))
Output
{
'companies': [
{
'id': 'apple',
'name': 'Apple',
'headquarters': 'Cupertino, California'
},
{
'id': 'google',
'name': 'Google',
'headquarters': 'Mountain View, California'
},
{
'id': 'tesla',
'name': 'Tesla',
'headquarters': 'Austin, Texas'
},
{
'id': 'deepmind',
'name': 'DeepMind',
'headquarters': 'London, United Kingdom'
}
]
}
Example 7
from __future__ import annotations
from dataclasses import dataclass
from bluemoss import Node, Range, Ex, scrape
@dataclass
class Companies:
companies: list[Company]
amount_uk_companies: int
amount_us_companies: int
@dataclass
class Company:
id: str
name: str
location: str
node = Node(
target=Companies,
nodes=[
Node(
key='amount_uk_companies',
transform=lambda tags: len(tags)
),
Node(
key='amount_us_companies',
transform=lambda tags: len(tags)
),
Node(
'li',
filter=None,
key='companies',
target=Company,
nodes=[
Node('a', key='name'),
Node('p', key='headquarters'),
Node(
'a',
key='id',
extract="href",
transform=lambda href: href.split("=")[1]
)
]
)
]
)
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Hashes for bluemoss-0.1.30-py3-none-any.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 3e6c7149c3efb2d81a368349116015fb9593798357f0948f23530047fc4675a9 |
|
MD5 | 94e4394c310c589a799173f4fb0537b2 |
|
BLAKE2b-256 | 148edec9f87befbe154074887e33a6586153d92b0b16a78ed1d0e3b06f3c6dc8 |