Skip to main content

bluemoss enables you to easily scrape websites.

Project description

bluemoss logo

bluemoss

a simple way to scrape the web


Code style: blue GitHub Workflow Status (main) Coverage Supported python versions Latest package version



<html>
    <head>
        <title>Portfolio</title>
    </head>
    <body>
        <!-- Sample portfolio page: one <li> per company, each holding a link
             (company name + id in the query string) and a location <div>. -->
        <ul>
            <li>
                <div>
                    <a href="/portfolio?company=apple">
                        Apple
                    </a>
                    <div class="location_us">
                        <p>Cupertino, California</p>
                    </div>
                </div>
            </li>

            <li>
                <div>
                    <a href="/portfolio?company=google">
                        Google
                    </a>
                    <div class="location_us">
                        <p>Mountain View, California</p>
                    </div>
                </div>
            </li>

            <li>
                <div>
                    <a href="/portfolio?company=tesla">
                        Tesla
                    </a>
                    <div class="location_us">
                        <p>Austin, Texas</p>
                    </div>
                </div>
            </li>

            <li>
                <div>
                    <a href="/portfolio?company=deepmind">
                        DeepMind
                    </a>
                    <div class="location_uk">
                        <p>London, United Kingdom</p>
                    </div>
                </div>
            </li>
        </ul>
    </body>
</html>

Example 1

from bluemoss import Node, scrape


# Every selector below yields the text of the first <a> tag in the sample HTML.
node = Node('a')              # scrape(node, HTML) == 'Apple'
node = Node('div/a')          # scrape(node, HTML) == 'Apple'
node = Node('body//a')        # scrape(node, HTML) == 'Apple'
node = Node('body//div/a')    # scrape(node, HTML) == 'Apple'
node = Node('a', filter=0)    # scrape(node, HTML) == 'Apple'


Example 2

from bluemoss import Node, scrape


# In the sample HTML the class attribute ("location_us" / "location_uk") sits
# on the <div> that wraps each <p>, so class predicates are applied to the
# div and the <p> is selected as its child.
node = Node('p')                                            # 'Cupertino, California'
node = Node('li//p')                                        # 'Cupertino, California'
node = Node('div/p')                                        # 'Cupertino, California'
node = Node('div//p')                                       # 'Cupertino, California'
node = Node('div[contains(@class, "location_")]/p')         # 'Cupertino, California'
node = Node('li//div[contains(@class, "location_")]/p')     # 'Cupertino, California'


Example 3

from bluemoss import Node, Range, scrape


# `filter` picks which of the matched <a> tags are returned:
# an int selects one match, a list selects several indexes, a Range selects
# a slice of matches, and None returns every match.
node = Node('a', filter=3)              # 'DeepMind'
node = Node('a', filter=[0, 2])         # ['Apple', 'Tesla']
node = Node('a', filter=Range(2))       # ['Tesla', 'DeepMind']
node = Node('a', filter=Range(2, 4))    # ['Tesla', 'DeepMind']
node = Node('a', filter=Range(2, 3))    # ['Tesla']
node = Node('a', filter=None)           # ['Apple', 'Google', 'Tesla', 'DeepMind']


Example 4

from bluemoss import Node, Range, Ex, scrape


# Three equivalent ways to read the href of every <a> tag except the first
# (Range(1) starts at index 1) and reduce it to the company id.
# The hrefs look like "/portfolio?company=google", so the id is the part
# after "=" -- splitting on "?" would leave "company=google".

# Variant 1: select the tag, extract the href via the Ex enum.
node_1 = Node(
    'a',
    filter=Range(1),
    extract=Ex.HREF,
    transform=lambda href: href.split('=')[1]
)
# scrape(node_1, HTML) == ['google', 'tesla', 'deepmind']


# Variant 2: select the tag, extract the attribute by its name.
node_2 = Node(
    'a',
    filter=Range(1),
    extract="href",
    transform=lambda href: href.split('=')[1]
)
# scrape(node_2, HTML) == ['google', 'tesla', 'deepmind']


# Variant 3: select the attribute directly in the XPath.
node_3 = Node(
    'a/@href',
    filter=Range(1),
    transform=lambda href: href.split('=')[1]
)
# scrape(node_3, HTML) == ['google', 'tesla', 'deepmind']


Example 5

from bluemoss import Node, scrape


def _company_id(href):
    """Return the part of the link after "=", e.g. 'apple'."""
    return href.split("=")[1]


# One child node per output key; the output shows one dict per matched <li>.
company_fields = [
    Node('a', key='name'),
    Node('p', key='headquarters'),
    Node('a/@href', key='id', transform=_company_id),
]

# filter=None keeps every <li>, so the result is a list with one dict each.
node = Node('li', filter=None, nodes=company_fields)


print(scrape(node, HTML))

Output

[
    {
        'id': 'apple',
        'name': 'Apple', 
        'headquarters': 'Cupertino, California'
    },
    {
        'id': 'google',
        'name': 'Google',
        'headquarters': 'Mountain View, California'
    },
    {
        'id': 'tesla',
        'name': 'Tesla', 
        'headquarters': 'Austin, Texas'
    },
    {
        'id': 'deepmind',
        'name': 'DeepMind',
        'headquarters': 'London, United Kingdom'
    }
]


Example 6

from bluemoss import Node, scrape


def _company_id(href):
    """Return the part of the link after "=", e.g. 'apple'."""
    return href.split("=")[1]


# One child node per key of each company dict.
company_fields = [
    Node('a', key='name'),
    Node('p', key='headquarters'),
    Node('a/@href', key='id', transform=_company_id),
]

# Giving the list node a key wraps the list of company dicts in an outer
# dict: {'companies': [...]}.
node = Node(
    'li',
    key='companies',
    filter=None,
    nodes=company_fields,
)


print(scrape(node, HTML))

Output

{
    'companies': [
        {
            'id': 'apple',
            'name': 'Apple', 
            'headquarters': 'Cupertino, California'
        },
        {
            'id': 'google',
            'name': 'Google',
            'headquarters': 'Mountain View, California'
        },
        {
            'id': 'tesla',
            'name': 'Tesla', 
            'headquarters': 'Austin, Texas'
        },
        {
            'id': 'deepmind',
            'name': 'DeepMind',
            'headquarters': 'London, United Kingdom'
        }
    ]
}


Example 7

from __future__ import annotations
from dataclasses import dataclass
from bluemoss import Node, Range, Ex, scrape


@dataclass
class Companies:
    """Result container for the whole page: a company list plus two counts."""

    # Scraped companies; `Company` is declared further down, which works
    # because annotations are evaluated lazily in this snippet.
    companies: list[Company]
    # Count of UK companies (per the field name).
    amount_uk_companies: int
    # Count of US companies (per the field name).
    amount_us_companies: int

    
@dataclass
class Company:
    """A single scraped company entry."""

    # Identifier of the company.
    id: str
    # Display name of the company.
    name: str
    # Location of the company.
    location: str


# Root node: its child keys must match the field names of `Companies`,
# because the collected values are passed to the target dataclass.
node = Node(
    target=Companies,
    nodes=[
        # Count the location divs per country. filter=None keeps every
        # match, so `len` receives the full list rather than a single item.
        Node(
            "div[contains(@class, 'location_uk')]",
            filter=None,
            key='amount_uk_companies',
            transform=len
        ),
        Node(
            "div[contains(@class, 'location_us')]",
            filter=None,
            key='amount_us_companies',
            transform=len
        ),
        # One `Company` per <li>; child keys mirror the Company fields
        # (id, name, location).
        Node(
            'li',
            filter=None,
            key='companies',
            target=Company,
            nodes=[
                Node('a', key='name'),
                Node('p', key='location'),
                Node(
                    'a',
                    key='id',
                    extract="href",
                    # "/portfolio?company=apple" -> "apple"
                    transform=lambda href: href.split("=")[1]
                )
            ]
        )
    ]
)

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

bluemoss-0.1.30.tar.gz (26.6 kB view hashes)

Uploaded Source

Built Distribution

bluemoss-0.1.30-py3-none-any.whl (20.9 kB view hashes)

Uploaded Python 3

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page