自分用・非汎用
Project description
nuki
自分用・非汎用
インストール
uv add nuki
uv run patchright install chromium
uv run camoufox fetch
使用例
スクレイピング
# Example: scrape classroom detail pages with a patchright-driven browser.
from nuki import wrap_page
from nuki.browser import patchright_page
from nuki.utils import append_csv, from_here, save_log, write_bytes

here = from_here(__file__)
save_log(here('log/scraping.log'))

with patchright_page() as page:
    p = wrap_page(page)
    p.goto('https://www.foobarbaz1.jp')

    # First pass: collect every classroom URL from the prefecture pages.
    pref_urls = p.ss('li.item > ul > li > a').urls
    classroom_urls = []
    for n, pref_url in enumerate(pref_urls, 1):
        print(f'pref_urls {n}/{len(pref_urls)}')
        if not p.goto(pref_url):
            append_csv(here('csv/failed.csv'), {'url': pref_url, 'reason': 'goto'})
            continue
        classroom_urls.extend(p.ss('.school-area h4 a').urls)

    # Second pass: scrape each classroom page into a CSV row plus media files.
    for n, classroom_url in enumerate(classroom_urls, 1):
        print(f'classroom_urls {n}/{len(classroom_urls)}')
        if not p.goto(classroom_url):
            append_csv(here('csv/failed.csv'), {'url': classroom_url, 'reason': 'goto'})
            continue
        th_grep = p.ss('th').re
        append_csv(here('csv/scrape.csv'), {
            'id': n,
            'URL': page.url,
            '教室名': p.s('h1 .text01').text,
            '住所': p.s('.item .mapText').text,
            '電話番号': p.s('.item .phoneNumber').text,
            'HP': th_grep.s(r'ホームページ').next('td').s('a').url,
            '営業時間': th_grep.s(r'営業時間').next('td').text,
            '定休日': th_grep.s(r'定休日').next('td').text,
        })
        p.s('.school-map').screenshot(here(f'media/{n}-screenshot.png'))
        img_url = p.s('.school-area img').src
        if img_url:
            res = p.goto(img_url)
            if res and res.ok:
                write_bytes(here(f'media/{n}-img.jpg'), res.body())
スクレイピング(HTML丸ごと保存)
# Example: crawl item pages with camoufox and save each page's raw HTML.
from nuki import wrap_page
from nuki.browser import camoufox_page
from nuki.utils import append_csv, from_here, hash_name, save_log, write_text

here = from_here(__file__)
save_log(here('log/scraping.log'))

with camoufox_page() as page:
    p = wrap_page(page)
    p.goto('https://www.foobarbaz1.jp')
    item_urls = p.ss('ul.items > li > a').urls
    for n, item_url in enumerate(item_urls, 1):
        print(f'item_urls {n}/{len(item_urls)}')
        if not p.goto(item_url):
            append_csv(here('csv/failed.csv'), {'url': item_url, 'reason': 'goto'})
            continue
        # Derive the output file name from the URL via hash_name.
        file_name = f'{hash_name(item_url)}.html'
        html = p.html(with_url=True, with_saved_at=True)
        if not write_text(here('html') / file_name, html):
            append_csv(here('csv/failed.csv'), {'url': item_url, 'reason': 'write_text'})
            continue
ローカルHTMLからデータ抽出&Parquet出力
# Example: extract fields from locally saved HTML files and write Parquet.
from nuki import wrap_parser
from nuki.utils import from_here, parse_html, save_log, write_parquet

here = from_here(__file__)
save_log(here('log/scraping.log'))

results = []
for n, html_path in enumerate(here('html').glob('*.html')):
    print(f'html {n}')
    parser = parse_html(html_path)
    if not parser:
        # Unparseable file: skip it and move on to the next one.
        continue
    p = wrap_parser(parser)
    dts = p.ss('dt').re
    results.append({
        'URL': p.url,
        'file_name': html_path.name,
        '教室名': p.s('h1 .text02').text,
        '住所': p.s('.item .mapText').text,
        '所在地': dts.s(r'所在地').next('dd').text,
        '交通': dts.s(r'交通').next('dd').text,
        '物件番号': dts.s(r'物件番号').next('dd').text,
    })

write_parquet(here('parquet/extract.parquet'), results)
ローカルHTMLからデータ抽出&Parquet出力(並列処理)
from pathlib import Path
from nuki import wrap_parser
from nuki.utils import from_here, glob_paths, parse_html, pool_map, write_parquet
def main():
    # Extract records from all saved HTML files in parallel, then write
    # the non-empty results to a single Parquet file.
    here = from_here(__file__)
    html_paths = glob_paths(here('html'), '*.html')
    rows = pool_map(extract, html_paths)
    results = [row for row in rows if row]
    write_parquet(here('parquet/extract.parquet'), results)
def extract(file_path: str) -> dict | None:
    """Parse one saved HTML file and return its extracted record.

    Returns None when the file cannot be parsed, so the caller can
    filter failures out of the parallel map's results.
    """
    parser = parse_html(Path(file_path))
    if not parser:
        return None
    p = wrap_parser(parser)
    dts = p.ss('dt').re
    return {
        'URL': p.url,
        'file_path': file_path,
        '教室名': p.s('h1 .text02').text,
        '住所': p.s('.item .mapText').text,
        '所在地': dts.s(r'所在地').next('dd').text,
        '交通': dts.s(r'交通').next('dd').text,
        '価格': dts.s(r'価格').next('dd').text,
        '設備・条件': dts.s(r'設備').next('dd').text,
        '備考': dts.s(r'備考').next('dd').text,
    }
# Entry-point guard; also keeps pool_map's workers (if process-based) from
# re-running main() when they import this module — TODO confirm pool_map uses processes.
if __name__ == '__main__':
    main()
License - ライセンス
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
nuki-0.1.10.tar.gz
(13.5 kB
view details)
Built Distribution
Filter files by name, interpreter, ABI, and platform.
If you're not sure about the file name format, learn more about wheel file names.
Copy a direct link to the current filters
nuki-0.1.10-py3-none-any.whl
(11.1 kB
view details)
File details
Details for the file nuki-0.1.10.tar.gz.
File metadata
- Download URL: nuki-0.1.10.tar.gz
- Upload date:
- Size: 13.5 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: python-requests/2.33.1
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | f4723e7515a9be1911c3d6409cb1048c1ca2e2a82f22a5d0d4391142a636044b |
| MD5 | c30632df5b1580ef95933f0f20e1e1ce |
| BLAKE2b-256 | ad14efb1f7934e58c0238516ba8f45b424f5df23cee540466e1e27e2cccc3cd7 |
File details
Details for the file nuki-0.1.10-py3-none-any.whl.
File metadata
- Download URL: nuki-0.1.10-py3-none-any.whl
- Upload date:
- Size: 11.1 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: python-requests/2.33.1
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | 7f13ee7077d1dc0f5f73e66cfb35f7f7fa53e3066f1d42b031dde5560b959769 |
| MD5 | 6a7909d02ce9960f275c73bd2621f7f1 |
| BLAKE2b-256 | 67330715059412dcd54920bc68337fbe3330efc34361bc8afb0596f600e68478 |