自分用・非汎用
Project description
nuki
自分用・非汎用
インストール
uv add nuki
uv run patchright install chromium
uv run camoufox fetch
使用例
スクレイピング
from nuki import wrap_page
from nuki.browser import patchright_page
from nuki.utils import add_log_file, append_csv, from_here
# Example: crawl listing pages, then append one CSV row per detail page.
# NOTE(review): from_here(__file__) presumably builds paths relative to this
# script's directory -- confirm against nuki.utils.
here = from_here(__file__)
add_log_file(here('log/scraping.log'))

with patchright_page() as page:
    p = wrap_page(page)

    # Step 1: collect the first-level ("pref") listing URLs from the top page.
    p.goto('https://www.foobarbaz1.jp')
    pref_urls = p.ss('li.item > ul > li > a').urls

    # Step 2: visit each listing page and gather the detail-page URLs.
    classroom_urls = []
    for i, url in enumerate(pref_urls, 1):
        print(f'pref_urls {i}/{len(pref_urls)}')
        if not p.goto(url):
            # A falsy goto() result is treated as a failed navigation:
            # record the URL and move on instead of aborting the run.
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
            continue
        classroom_urls.extend(p.ss('.school-area h4 a').urls)

    # Step 3: visit each detail page and append one row to the output CSV.
    for i, url in enumerate(classroom_urls, 1):
        print(f'classroom_urls {i}/{len(classroom_urls)}')
        if not p.goto(url):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
            continue

        # Regex lookup over all <th> cells; each value below is read from the
        # <td> that follows the matching header cell.
        th_grep = p.ss('th').re
        append_csv(here('csv/scrape.csv'), {
            'URL': page.url,
            '教室名': p.s('h1 .text01').text,
            '住所': p.s('.item .mapText').text,
            '電話番号': p.s('.item .phoneNumber').text,
            'HP': th_grep.s(r'ホームページ').next('td').s('a').url,
            '営業時間': th_grep.s(r'営業時間').next('td').text,
            '定休日': th_grep.s(r'定休日').next('td').text,
        })
スクレイピング(HTML丸ごと保存)
from nuki import wrap_page
from nuki.browser import camoufox_page
from nuki.utils import add_log_file, append_csv, from_here, hash_name, save_html
# Example: crawl item pages and store each page's HTML on disk for later parsing.
here = from_here(__file__)
add_log_file(here('log/scraping.log'))

with camoufox_page() as page:
    p = wrap_page(page)

    # Collect the item URLs from the top page.
    p.goto('https://www.foobarbaz1.jp')
    item_urls = p.ss('ul.items > li > a').urls

    for i, url in enumerate(item_urls, 1):
        print(f'item_urls {i}/{len(item_urls)}')
        if not p.goto(url):
            # A falsy goto() result is treated as a failed navigation.
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
            continue

        # The file name is derived from the URL via hash_name().
        file_name = f'{hash_name(url)}.html'
        # NOTE(review): with_url=True presumably embeds the page URL in the
        # saved HTML so it can be read back later -- confirm in nuki.
        if not save_html(here('html') / file_name, p.html(with_url=True)):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'save_html'})
            continue
ローカルHTMLからデータ抽出&Parquet出力
from nuki import wrap_parser
from nuki.utils import add_log_file, from_here, parse_html, write_parquet
# Example: parse previously saved HTML files and write all rows to one Parquet file.
here = from_here(__file__)
add_log_file(here('log/scraping.log'))

results = []
# Count from 1 so the progress output matches the other examples.
for i, file_path in enumerate(here('html').glob('*.html'), 1):
    print(f'html {i}')
    # A falsy parse_html() result is treated as "could not parse": skip the file.
    if not (parser := parse_html(file_path)):
        continue

    p = wrap_parser(parser)
    # Regex lookup over all <dt> cells; each value below is read from the
    # <dd> that follows the matching <dt>.
    dts = p.ss('dt').re
    results.append({
        'URL': p.url,
        'file_name': file_path.name,
        '教室名': p.s('h1 .text02').text,
        '住所': p.s('.item .mapText').text,
        '所在地': dts.s(r'所在地').next('dd').text,
        '交通': dts.s(r'交通').next('dd').text,
        '物件番号': dts.s(r'物件番号').next('dd').text,
    })

# Write once, after every file has been processed.
write_parquet(here('parquet/extract.parquet'), results)
ローカルHTMLからデータ抽出&Parquet出力(並列処理)
from nuki import wrap_parser
from nuki.utils import from_here, glob_paths, parse_html, pool_map, write_parquet
def main() -> None:
    """Extract data from every saved HTML file via pool_map and write one Parquet file."""
    here = from_here(__file__)
    html_paths = glob_paths(here('html'), '*.html')
    # extract() returns None for files that fail to parse; drop those rows.
    results = [r for r in pool_map(extract, html_paths) if r]
    write_parquet(here('parquet/extract.parquet'), results)
def extract(file_path: str) -> dict | None:
    """Build one output row from a saved HTML file.

    Args:
        file_path: Path of the HTML file to parse.

    Returns:
        A dict of extracted fields, or None when parse_html() yields a
        falsy result for the file.
    """
    if not (parser := parse_html(file_path)):
        return None

    p = wrap_parser(parser)
    # Regex lookup over all <dt> cells; each value below is read from the
    # <dd> that follows the matching <dt>.
    dts = p.ss('dt').re
    return {
        'URL': p.url,
        'file_path': file_path,
        '教室名': p.s('h1 .text02').text,
        '住所': p.s('.item .mapText').text,
        '所在地': dts.s(r'所在地').next('dd').text,
        '交通': dts.s(r'交通').next('dd').text,
        '価格': dts.s(r'価格').next('dd').text,
        '設備・条件': dts.s(r'設備').next('dd').text,
        '備考': dts.s(r'備考').next('dd').text,
    }
# Run only when executed as a script, not when this module is imported.
if __name__ == '__main__':
    main()
License - ライセンス
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
nuki-0.1.5.tar.gz (11.9 kB, view details)
Built Distribution
Filter files by name, interpreter, ABI, and platform.
If you're not sure about the file name format, learn more about wheel file names.
Copy a direct link to the current filters
nuki-0.1.5-py3-none-any.whl (9.6 kB, view details)
File details
Details for the file nuki-0.1.5.tar.gz.
File metadata
- Download URL: nuki-0.1.5.tar.gz
- Upload date:
- Size: 11.9 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: python-requests/2.33.1
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | b2977696765389b0f8967cf5b3f2833918bfcc7c322946905d81c822400c585d |
| MD5 | d57764a7affdb0f2fb140b080d77a629 |
| BLAKE2b-256 | b316c8d593b15bb3f53b1b86470a7a672de0a6de0b0019fe71f6c4cf7cda8467 |
File details
Details for the file nuki-0.1.5-py3-none-any.whl.
File metadata
- Download URL: nuki-0.1.5-py3-none-any.whl
- Upload date:
- Size: 9.6 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: python-requests/2.33.1
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | ba6b75679c3fb448fbba1971ceaf4e226b7250bfa2cb7fce3828d2c17e2145bf |
| MD5 | bf89f7dd6eb40c72577ee93a767dde3f |
| BLAKE2b-256 | ef3c8d74ed4a05764a80f79e8d7e3699bf454f9fe075d5a11d44b670f6309c3f |