wavve parser
Project description
WavveParser
Crawls the most popular episodes from Wavve.
1. Data Crawling Info
#### 3 genres
1) All genre
2) Drama
3) Entertainment
#### 5 pages per genre (20 items per page / 100 items in total)
#### columns
- rank : rank by genre
- title : program title
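- broadcast : broadcasting channel name
- episode : episode number
- date : release date followed by the weekday in parentheses
- genre : all, Drama, Entertainment
- category : 지상파 (terrestrial), 종편 (general programming), 케이블 (cable TV)
- DateTime : crawling date (the value passed to the constructor)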
2. Package File
import pandas as pd
import numpy as np
import requests
import json
3. Installation
```bash
pip install WavveParser
```
4. Wavve_parser
"""python
import pandas as pd
import numpy as np
import requests
import json
class WavveParser:
    """Crawl Wavve's popular-episode listing into a pandas DataFrame.

    Constructing an instance runs the whole crawl: it collects the content
    ids from the popular-contents endpoint, downloads the detail record of
    every id, and assembles the result table.

    Attributes:
        apikey: API key inserted into every request URL.
        contents_id_dfs: DataFrame with a single ``id`` column holding every
            content id that was collected, in fetch order.
        contents_info: List of decoded detail responses, one per content id.
        category_df: Final table with the columns ``rank``, ``title``,
            ``broadcast``, ``episode``, ``date``, ``genre``, ``category``
            and ``DateTime``.
    """

    # Seconds to wait for the API before giving up. ``requests`` applies no
    # timeout unless one is passed, so a stalled connection would otherwise
    # block the crawl forever.
    REQUEST_TIMEOUT = 10

    # Columns produced by ParseDetailInfo, in output order.
    _DETAIL_COLUMNS = ['title', 'broadcast', 'episode', 'date', 'genre']

    def __init__(self, datetime, apikey):
        """Run the crawl and build ``self.category_df``.

        Args:
            datetime: Value stored unchanged in the ``DateTime`` column of
                every row.
            apikey: API key used for all requests.
        """
        self.apikey = apikey
        contents_id = self.getPopularId()
        self.contents_id_dfs = pd.concat(contents_id, ignore_index=True)
        self.contents_info = self.getDetailInfo()
        category_df, categories = self.ParseDetailInfo()

        # Channels listed in neither the terrestrial nor the general
        # programming group are labelled as cable.
        is_terrestrial = category_df['broadcast'].isin(categories['지상파'])
        is_general = category_df['broadcast'].isin(categories['종편'])
        category_df['category'] = np.where(
            is_terrestrial, '지상파', np.where(is_general, '종편', '케이블'))

        # NOTE(review): this is a running 1-based position over all fetched
        # rows; it does not restart for each genre.
        category_df['rank'] = category_df.index + 1
        category_df['DateTime'] = datetime
        self.category_df = category_df.reindex(columns=[
            'rank', 'title', 'broadcast', 'episode', 'date', 'genre',
            'category', 'DateTime'])

    def _fetch_json(self, url):
        """GET *url* and return the decoded JSON body.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()

    def _popular_url(self, genre, page):
        """Return the popular-contents URL for *genre* and 1-based *page*."""
        offset = (page - 1) * 20
        return (
            'https://apis.pooq.co.kr/cf/vod/popularcontents'
            '?WeekDay=all&broadcastid=6339&came=broadcast&contenttype=vod'
            f'&genre={genre}&limit=20&offset={offset}&orderby=viewtime'
            f'&page={page}&uiparent=GN2-VN2&uirank=2&uitype=VN2'
            f'&apikey={self.apikey}&credential=none&device=pc&drm=wm'
            '&partner=pooq&pooqzone=none&region=kor&targetage=auto'
        )

    def _detail_url(self, content_id):
        """Return the detail URL for a single *content_id*."""
        # NOTE(review): the credential below is hard-coded in the source.
        # It looks like a session token and should probably be supplied by
        # the caller instead - confirm before publishing.
        return (
            f'https://apis.pooq.co.kr/vod/contents/{content_id}'
            '?device=pc&partner=pooq&pooqzone=none&region=kor&drm=wm'
            f'&targetage=auto&apikey={self.apikey}'
            '&credential=gnay3eDqvjaYTaFwZFAJ57u0nvz33CA2FoHsr5NsY8OCv2wWeu3ZRgaY9Ci2CjRlAd03D4A%2BIdixX2iwjy6jRFjRGc9qw%2BSkVjGFCJxuSRe86SSYVVK953HfiFKuKb6A3nNVJoHyY6gvgpSgRpyNNeZOzMNkqmc2RcGu%2FWrnAXFDATjT2IpHfym9Ng6rPXCbvkd9q3Y%2FsfQrOSB%2BLRTp4IL6AnvszoJi8ccV9AJhR37vOmwOwiV76z7QJexM054Dhp04KJCHm8HmpZANhV1iOw%3D%3D'
        )

    def getPopularId(self):
        """Collect the content ids of the popular listing.

        Five pages of 20 items are requested for each of the three genre
        codes (``all``, ``01``, ``02``).

        Returns:
            A list of DataFrames, one per fetched page, each with a single
            ``id`` column.
        """
        genres = {
            'all': '전체',
            '01': "드라마",
            '02': "예능",
        }
        popular_urls = [
            self._popular_url(genre, page)
            for genre in genres
            for page in range(1, 6)
        ]

        contents_id = []
        for popular_url in popular_urls:
            data = self._fetch_json(popular_url)
            # The id is the part after the colon in the fourth ``bodylist``
            # entry of each cell's first event.
            item_ids = [
                item['event_list'][0]['bodylist'][3].split(':')[1]
                for item in data['cell_toplist']['celllist']
            ]
            contents_id.append(pd.DataFrame({'id': item_ids}))
        return contents_id

    def getDetailInfo(self):
        """Download the detail record of every id in ``contents_id_dfs``.

        Returns:
            A list with the decoded JSON response of each content id, in
            the same order as the ids.
        """
        return [
            self._fetch_json(self._detail_url(content_id))
            for content_id in self.contents_id_dfs['id']
        ]

    def ParseDetailInfo(self):
        """Turn ``self.contents_info`` into a table of episode fields.

        Returns:
            A tuple ``(frame, categories)``. ``frame`` has the columns
            ``title``, ``broadcast``, ``episode``, ``date`` and ``genre``
            (also when there are no records); ``categories`` maps each
            category label to the channel names that belong to it.
        """
        categories = {
            '지상파': ['MBC', 'SBS', 'KBS 2TV', 'KBS 1TV', 'KBS'],
            '종편': ['MBN', '채널A', 'TV조선', 'TV CHOSUN'],
            '케이블': ['MBC Every1', 'KBS JOY', 'OCN', 'YTN', '연합뉴스TV', 'KTH PLAYY', 'SBS Fil']
        }
        wavve_list = [
            {
                'title': content_info['programtitle'],
                'broadcast': content_info['channelname'],
                'episode': content_info['episodenumber'],
                # Release date with the weekday appended in parentheses.
                'date': content_info['releasedate'] + "(" + content_info['releaseweekday'] + ")",
                'genre': content_info['genretext'],
            }
            for content_info in self.contents_info
        ]
        # Naming the columns keeps the frame's shape stable for empty input.
        frame = pd.DataFrame(wavve_list, columns=self._DETAIL_COLUMNS)
        return frame, categories
"""
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distributions
No source distribution files available for this release. See tutorial on generating distribution archives.
Built Distribution
Close
Hashes for WavveParser-0.1-py3-none-any.whl
Algorithm | Hash digest |
---|---|
SHA256 | db3343c619bdf58c9ed1f8c3ae1b7bcc1a8f3d4000f9026f5105f644f7458da8 |
MD5 | e946236c247c4df553328c62c3a41924 |
BLAKE2b-256 | cd59f6047f7e2fb9bc7ccb3c0b98c80cfb36ba7eaf9086e128a0644c15801596 |