Skip to main content

自分用・非汎用

Project description

nuki

自分用・非汎用

インストール

uv add nuki
uv run patchright install chromium
uv run camoufox fetch

使用例

スクレイピング

from nuki import wrap_page
from nuki.browser import patchright_page
from nuki.utils import append_csv, from_here, save_log, write_bytes

# Example: crawl prefecture pages -> classroom pages, writing one CSV row,
# one element screenshot and (when available) one image per classroom.
# NOTE(review): from_here/save_log/append_csv/write_bytes are nuki helpers;
# their exact semantics are not visible here - confirm against nuki itself.
here = from_here(__file__)
save_log(here('log/scraping.log'))

with patchright_page() as page:
    p = wrap_page(page)

    # Step 1: gather the prefecture-level links from the top page.
    p.goto('https://www.foobarbaz1.jp')
    pref_urls = p.ss('li.item > ul > li > a').urls
    pref_total = len(pref_urls)

    # Step 2: visit every prefecture page and accumulate classroom links.
    classroom_urls = []
    for n, pref_url in enumerate(pref_urls, 1):
        print(f'pref_urls {n}/{pref_total}')
        if p.goto(pref_url):
            classroom_urls += p.ss('.school-area h4 a').urls
        else:
            # A falsy goto() result is logged to failed.csv and the URL skipped.
            append_csv(here('csv/failed.csv'), {'url': pref_url, 'reason': 'goto'})

    # Step 3: scrape each classroom page.
    classroom_total = len(classroom_urls)
    for n, classroom_url in enumerate(classroom_urls, 1):
        print(f'classroom_urls {n}/{classroom_total}')
        if not p.goto(classroom_url):
            append_csv(here('csv/failed.csv'), {'url': classroom_url, 'reason': 'goto'})
            continue

        # Table cells are looked up by matching the <th> text, then reading
        # the neighbouring <td>.
        th_grep = p.ss('th').re
        scrape_csv = here('csv/scrape.csv')
        row = {
            'id': n,
            'URL': page.url,
            '教室名': p.s('h1 .text01').text,
            '住所': p.s('.item .mapText').text,
            '電話番号': p.s('.item .phoneNumber').text,
            'HP': th_grep.s(r'ホームページ').next('td').s('a').url,
            '営業時間': th_grep.s(r'営業時間').next('td').text,
            '定休日': th_grep.s(r'定休日').next('td').text,
        }
        append_csv(scrape_csv, row)

        p.s('.school-map').screenshot(here(f'media/{n}-screenshot.png'))

        # Download the image only if a src exists and navigating to it
        # yields an OK response; this leaves the page on the image URL
        # until the next goto().
        if (img_url := p.s('.school-area img').src) and (res := p.goto(img_url)) and res.ok:
            write_bytes(here(f'media/{n}-img.jpg'), res.body())

スクレイピング(HTML丸ごと保存)

from nuki import wrap_page
from nuki.browser import camoufox_page
from nuki.utils import append_csv, from_here, hash_name, save_log, write_text

# Example: visit every item page and store its full HTML on disk, one file
# per URL, for later offline extraction.
# NOTE(review): helper semantics (hash_name, write_text, p.html) come from
# nuki and are not visible here - confirm against the library.
here = from_here(__file__)
save_log(here('log/scraping.log'))

with camoufox_page() as page:
    p = wrap_page(page)

    p.goto('https://www.foobarbaz1.jp')
    item_urls = p.ss('ul.items > li > a').urls
    item_total = len(item_urls)

    for n, item_url in enumerate(item_urls, 1):
        print(f'item_urls {n}/{item_total}')
        if not p.goto(item_url):
            # Navigation failed: record it and move on to the next item.
            append_csv(here('csv/failed.csv'), {'url': item_url, 'reason': 'goto'})
            continue

        # The file name is derived from the URL via hash_name().
        file_name = f'{hash_name(item_url)}.html'
        saved = write_text(here('html') / file_name, p.html(with_url=True, with_saved_at=True))
        if not saved:
            # A falsy write_text() result is treated as a failed save.
            append_csv(here('csv/failed.csv'), {'url': item_url, 'reason': 'write_text'})

ローカルHTMLからデータ抽出&Parquet出力

from nuki import wrap_parser
from nuki.utils import from_here, parse_html, save_log, write_parquet

# Example: parse every saved HTML file sequentially and write the extracted
# rows to a single Parquet file.
here = from_here(__file__)
save_log(here('log/scraping.log'))

results = []
for i, file_path in enumerate(here('html').glob('*.html')):
    print(f'html {i}')

    # Files for which parse_html() returns a falsy value are skipped.
    parser = parse_html(file_path)
    if not parser:
        continue

    p = wrap_parser(parser)
    dts = p.ss('dt').re

    row = {
        'URL': p.url,
        'file_name': file_path.name,
        '教室名': p.s('h1 .text02').text,
        '住所': p.s('.item .mapText').text,
    }
    # Remaining columns: match a <dt> by its label, then read the next <dd>.
    for column, label in (
        ('所在地', r'所在地'),
        ('交通', r'交通'),
        ('物件番号', r'物件番号'),
    ):
        row[column] = dts.s(label).next('dd').text
    results.append(row)

write_parquet(here('parquet/extract.parquet'), results)

ローカルHTMLからデータ抽出&Parquet出力(並列処理)

from pathlib import Path

from nuki import wrap_parser
from nuki.utils import from_here, glob_paths, parse_html, pool_map, write_parquet

def main():
    """Extract data from every saved HTML file and write one Parquet file.

    Each path found under ``html/`` is passed to ``extract`` through
    ``pool_map``; falsy results (e.g. ``None`` from ``extract``) are dropped
    before writing.
    """
    base = from_here(__file__)
    sources = glob_paths(base('html'), '*.html')
    # filter(None, ...) keeps only truthy results.
    rows = list(filter(None, pool_map(extract, sources)))
    write_parquet(base('parquet/extract.parquet'), rows)

def extract(file_path: str) -> dict | None:
    """Build one output record from a saved HTML file.

    Args:
        file_path: Path of the HTML file, as a string.

    Returns:
        A dict of extracted columns, or ``None`` when ``parse_html`` returns
        a falsy value for the file.
    """
    parser = parse_html(Path(file_path))
    if not parser:
        return None

    doc = wrap_parser(parser)
    dt_grep = doc.ss('dt').re

    record = {
        'URL': doc.url,
        'file_path': file_path,
        '教室名': doc.s('h1 .text02').text,
        '住所': doc.s('.item .mapText').text,
    }
    # (output column, <dt> label pattern): the value is the text of the
    # <dd> that follows the matched <dt>.
    for column, label in (
        ('所在地', r'所在地'),
        ('交通', r'交通'),
        ('価格', r'価格'),
        ('設備・条件', r'設備'),
        ('備考', r'備考'),
    ):
        record[column] = dt_grep.s(label).next('dd').text
    return record

# Run main() only when this file is executed as a script.
# NOTE(review): presumably pool_map uses worker processes, in which case this
# guard also keeps them from re-running main() on import - confirm.
if __name__ == '__main__':
    main()

License - ライセンス

MIT

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

nuki-0.1.10.tar.gz (13.5 kB view details)

Uploaded Source

Built Distribution

If you're not sure about the file name format, learn more about wheel file names.

nuki-0.1.10-py3-none-any.whl (11.1 kB view details)

Uploaded Python 3

File details

Details for the file nuki-0.1.10.tar.gz.

File metadata

  • Download URL: nuki-0.1.10.tar.gz
  • Upload date:
  • Size: 13.5 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: python-requests/2.33.1

File hashes

Hashes for nuki-0.1.10.tar.gz
Algorithm Hash digest
SHA256 f4723e7515a9be1911c3d6409cb1048c1ca2e2a82f22a5d0d4391142a636044b
MD5 c30632df5b1580ef95933f0f20e1e1ce
BLAKE2b-256 ad14efb1f7934e58c0238516ba8f45b424f5df23cee540466e1e27e2cccc3cd7

See more details on using hashes here.

File details

Details for the file nuki-0.1.10-py3-none-any.whl.

File metadata

  • Download URL: nuki-0.1.10-py3-none-any.whl
  • Upload date:
  • Size: 11.1 kB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: python-requests/2.33.1

File hashes

Hashes for nuki-0.1.10-py3-none-any.whl
Algorithm Hash digest
SHA256 7f13ee7077d1dc0f5f73e66cfb35f7f7fa53e3066f1d42b031dde5560b959769
MD5 6a7909d02ce9960f275c73bd2621f7f1
BLAKE2b-256 67330715059412dcd54920bc68337fbe3330efc34361bc8afb0596f600e68478

See more details on using hashes here.

Supported by

AWS Cloud computing and Security Sponsor Datadog Monitoring Depot Continuous Integration Fastly CDN Google Download Analytics Pingdom Monitoring Sentry Error logging StatusPage Status page