
domx

For personal use; not general-purpose.

Installation

uv add domx
uv run patchright install chromium  # download the Chromium build that Patchright drives
uv run camoufox fetch               # download the Camoufox browser binary

Usage examples
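
The examples below double as the API reference. The recurring pattern: wrap a Playwright page with wrap_page, query with s (first match) and ss (all matches), narrow element lists by regex via .re.s(...), and hop to sibling elements with .next(...). A minimal sketch of that pattern, inferred from the examples (the URL and selectors are placeholders):

from domx import wrap_page
from domx.browser import patchright_page

with patchright_page() as page:
    p = wrap_page(page)
    if p.goto('https://example.com'):  # goto returns a falsy value on failure
        print(p.s('h1').text)          # s(): first matching element
        print(p.ss('a').urls)          # ss(): all matches; .urls resolves hrefs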

Scraping

from domx import wrap_page
from domx.browser import patchright_page
from domx.utils import append_csv, from_here, save_log, write_bytes

here = from_here(__file__)
save_log(here('log/scraping.log'))

with patchright_page() as page:
    p = wrap_page(page)

    p.goto('https://www.foobarbaz1.jp')
    pref_urls = p.ss('li.item > ul > li > a').urls

    classroom_urls = []
    for i, url in enumerate(pref_urls, 1):
        print(f'pref_urls {i}/{len(pref_urls)}')
        if not p.goto(url):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
            continue
        classroom_urls.extend(p.ss('.school-area h4 a').urls)

    for i, url in enumerate(classroom_urls, 1):
        print(f'classroom_urls {i}/{len(classroom_urls)}')
        if not p.goto(url):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
            continue
        th_grep = p.ss('th').re  # regex-filterable view of the page's <th> label cells
        append_csv(here('csv/scrape.csv'), {
            'id': i,
            'URL': page.url,
            '教室名': p.s('h1 .text01').text,
            '住所': p.s('.item .mapText').text,
            '電話番号': p.s('.item .phoneNumber').text,
            'HP': th_grep.s(r'ホームページ').next('td').s('a').url,
            '営業時間': th_grep.s(r'営業時間').next('td').text,
            '定休日': th_grep.s(r'定休日').next('td').text,
        })
        p.s('.school-map').screenshot(here(f'media/{i}-screenshot.png'))
        if (img_url := p.s('.school-area img').src):
            if (res := p.goto(img_url)) and res.ok:
                write_bytes(here(f'media/{i}-img.jpg'), res.body())
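
Every loop above repeats the same navigate-or-log-failure step, which can be factored into a helper. A sketch using only calls that appear in the example; goto_or_log is a name introduced here, not part of domx:

def goto_or_log(p, url, failed_csv):
    # Navigate; on failure, record the URL in the failures CSV and report False.
    if p.goto(url):
        return True
    append_csv(failed_csv, {'url': url, 'reason': 'goto'})
    return False

Each loop body then opens with: if not goto_or_log(p, url, here('csv/failed.csv')): continue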

Scraping (also saving screenshots and images)

import time
from urllib.parse import urlencode

from domx import wrap_page
from domx.browser import patchright_page
from domx.utils import save_log, append_csv, from_here, write_bytes

here = from_here(__file__)
save_log(here('log/scraping.log'))

with patchright_page() as page:
    p = wrap_page(page)
    
    p.goto('https://example.com/demo/search')
    prefecture_urls = p.ss('li > a[href^="https://example.com/demo/search/area/"]').urls

    bukken_urls = []
    for i, prefecture_url in enumerate(prefecture_urls, 1):
        print(f'{i}/{len(prefecture_urls)} エリア一覧ページ')
        page_num = 1
        while True:
            if not p.goto(f'{prefecture_url}?{urlencode({"page": page_num})}'):
                break
            if not (bukken_elems := p.ss('ul li div a[href^="https://example.com"]:has(p)')):
                break
            bukken_urls.extend(bukken_elems.urls)
            page_num += 1
    
    for i, url in enumerate(bukken_urls, 1):
        print(f'{i}/{len(bukken_urls)} 詳細ページ {url}')
        if not p.goto(url):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
            continue
        
        # Find the <h4> labeled 概要 (overview), take the following <div> of <dl>s,
        # and grep its <dt> label cells; dd_text returns the value cell for a label.
        dt_grep = p.ss('h4').re.s(r'概要').next('div:has(dl)').ss('dt').re
        dd_text = lambda pattern: dt_grep.s(pattern).next('dd').text

        append_csv(here('csv/scrape.csv'), {
            'id': i,
            'URL': page.url,
            '価格': dd_text(r'価格'),
            '所在地': dd_text(r'所在地'),
            '交通': dd_text(r'交通'),
            '駐車場': dd_text(r'駐車場'),
            '備考': dd_text(r'備考'),
            '情報更新日': dd_text(r'情報更新日'),
        })
        
        # Hide the fixed header/footer so they don't overlap the screenshots below.
        page.add_style_tag(content='header, footer.site-footer { visibility: hidden !important; }')
        
        p.ss('h4').re.s(r'概要').next('div:has(dl)').screenshot(path=here(f'media/{i}-summary.png'))

        elem_iframe = p.s('iframe[src^="https://example.com"]')
        elem_iframe.scroll_into_view()
        time.sleep(3)  # give the lazily loaded iframe time to render before the screenshot
        elem_iframe.screenshot(path=here(f'media/{i}-iframe.png'))
        
        main_img_url = p.s('img.w-full.object-contain').src
        
        # Prefer the image captioned 表紙 (front cover); otherwise take the first
        # caption that does not mention 裏面 (back side).
        img_desc_grep = p.ss('p.text-left').re.s(r'画像をクリック').next('ul').ss('li p').re
        img_desc = img_desc_grep.s(r'表紙') or img_desc_grep.s(r'^(?!.*裏面).*')
        img_url = img_desc.parent('li').s('a').url
        
        if main_img_url and (res := p.goto(main_img_url)) and res.ok:
            write_bytes(here(f'media/{i}-main-img.jpg'), res.body())
        if img_url and (res := p.goto(img_url)) and res.ok:
            write_bytes(here(f'media/{i}-img-desc.jpg'), res.body())
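
The pagination loop (increment page until navigation fails or a page yields no items) is another reusable shape. A generator sketch under the same assumptions about domx's API; iter_list_pages is a name introduced here:

from urllib.parse import urlencode

def iter_list_pages(p, base_url, item_selector):
    # Yield each page's matching elements; stop when goto fails or a page is empty.
    page_num = 1
    while p.goto(f'{base_url}?{urlencode({"page": page_num})}'):
        elems = p.ss(item_selector)
        if not elems:
            break
        yield elems
        page_num += 1

The collection loop then reduces to extending bukken_urls with elems.urls for each yielded page.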

Scraping (saving the full HTML)

from domx import wrap_page
from domx.browser import camoufox_page
from domx.utils import append_csv, from_here, hash_name, save_log, write_text

here = from_here(__file__)
save_log(here('log/scraping.log'))

with camoufox_page() as page:
    p = wrap_page(page)

    p.goto('https://www.foobarbaz1.jp')
    item_urls = p.ss('ul.items > li > a').urls

    for i, url in enumerate(item_urls, 1):
        print(f'item_urls {i}/{len(item_urls)}')
        if not p.goto(url):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
            continue
        file_name = f'{hash_name(url)}.html'
        if not write_text(here('html') / file_name, p.html(with_url=True, with_saved_at=True)):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'write_text'})
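
Saving the raw HTML first keeps extraction re-runnable without touching the site again; with_url=True presumably embeds the source URL in the saved file, which is how p.url is recovered in the next example. hash_name presumably derives a stable, filesystem-safe name from the URL; a plausible stdlib equivalent (domx's actual implementation may differ):

import hashlib

def hash_name_equivalent(url: str) -> str:
    # Stable, filesystem-safe name derived from the URL (illustrative only).
    return hashlib.sha256(url.encode()).hexdigest()[:16]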

Extracting data from local HTML & writing Parquet

from domx import wrap_parser
from domx.utils import from_here, parse_html, save_log, write_parquet

here = from_here(__file__)
save_log(here('log/scraping.log'))

results = []
for i, file_path in enumerate(here('html').glob('*.html'), 1):
    print(f'html {i}')
    if not (parser := parse_html(file_path)):
        continue
    p = wrap_parser(parser)
    dts = p.ss('dt').re
    results.append({
        'URL': p.url,
        'file_name': file_path.name,
        '教室名': p.s('h1 .text02').text,
        '住所': p.s('.item .mapText').text,
        '所在地': dts.s(r'所在地').next('dd').text,
        '交通': dts.s(r'交通').next('dd').text,
        '物件番号': dts.s(r'物件番号').next('dd').text,
    })
write_parquet(here('parquet/extract.parquet'), results)
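
To spot-check the Parquet output (assuming pandas plus a Parquet engine such as pyarrow is installed; polars works just as well):

import pandas as pd

df = pd.read_parquet(here('parquet/extract.parquet'))
print(len(df), 'rows')
print(df.head())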

Extracting data from local HTML & writing Parquet (parallel processing)

from pathlib import Path

from domx import wrap_parser
from domx.utils import from_here, glob_paths, parse_html, pool_map, write_parquet

def main():
    here = from_here(__file__)
    html_paths = glob_paths(here('html'), '*.html')
    results = [r for r in pool_map(extract, html_paths) if r]
    write_parquet(here('parquet/extract.parquet'), results)

# Module-level (not a closure) so it stays picklable if pool_map uses worker processes.
def extract(file_path: str) -> dict | None:
    if not (parser := parse_html(Path(file_path))):
        return None
    p = wrap_parser(parser)
    dts = p.ss('dt').re
    return {
        'URL': p.url,
        'file_path': file_path,
        '教室名': p.s('h1 .text02').text,
        '住所': p.s('.item .mapText').text,
        '所在地': dts.s(r'所在地').next('dd').text,
        '交通': dts.s(r'交通').next('dd').text,
        '価格': dts.s(r'価格').next('dd').text,
        '設備・条件': dts.s(r'設備').next('dd').text,
        '備考': dts.s(r'備考').next('dd').text,
    }

if __name__ == '__main__':
    main()
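
pool_map is presumably a thin wrapper over a process pool, which would explain why extract lives at module level (workers must be able to import and pickle it) and why paths travel as plain strings. If that assumption holds, the mapping step is equivalent to the stdlib:

from multiprocessing import Pool

with Pool() as pool:
    results = [r for r in pool.map(extract, html_paths) if r]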

License

MIT
