A python package to scrape data from Ghana News Portals
Project description
GhanaNews Scraper
A simple, unofficial Python package to scrape data from GhanaWeb, MyJoyOnline, DailyGraphic, CitiBusinessNews, YenGH, 3News, MyNewsGh, and PulseGh. Affiliated with Bank of Ghana Fx Rates and GhanaShops-Scraper.
How to install
pip install ghananews-scraper
Example Google Colab Notebook
Warning: DO NOT RUN GHANAWEB CODE IN ONLINE Google Colabs
Some GhanaWeb Urls:
urls = [
"https://www.ghanaweb.com/GhanaHomePage/regional/",
"https://www.ghanaweb.com/GhanaHomePage/editorial/",
"https://www.ghanaweb.com/GhanaHomePage/health/",
"https://www.ghanaweb.com/GhanaHomePage/diaspora/",
"https://www.ghanaweb.com/GhanaHomePage/tabloid/",
"https://www.ghanaweb.com/GhanaHomePage/africa/",
"https://www.ghanaweb.com/GhanaHomePage/religion/",
"https://www.ghanaweb.com/GhanaHomePage/NewsArchive/",
"https://www.ghanaweb.com/GhanaHomePage/business/",
"https://www.ghanaweb.com/GhanaHomePage/SportsArchive/",
"https://www.ghanaweb.com/GhanaHomePage/entertainment/",
"https://www.ghanaweb.com/GhanaHomePage/africa/",
"https://www.ghanaweb.com/GhanaHomePage/television/"
]
Outputs
- All outputs will be saved in a `.csv` file. Other file formats are not yet supported.
Usage
from ghanaweb.scraper import GhanaWeb
url = 'https://www.ghanaweb.com/GhanaHomePage/politics/'
# url = "https://www.ghanaweb.com/GhanaHomePage/NewsArchive/"
# url = 'https://www.ghanaweb.com/GhanaHomePage/health/'
# url = 'https://www.ghanaweb.com/GhanaHomePage/crime/'
# url = 'https://www.ghanaweb.com/GhanaHomePage/regional/'
# url = 'https://www.ghanaweb.com/GhanaHomePage/year-in-review/'
# web = GhanaWeb(url='https://www.ghanaweb.com/GhanaHomePage/politics/')
web = GhanaWeb(url=url)
# scrape data and save to `current working dir`
web.download(output_dir=None)
Scrape list of articles from GhanaWeb
from ghanaweb.scraper import GhanaWeb
urls = [
'https://www.ghanaweb.com/GhanaHomePage/politics/',
'https://www.ghanaweb.com/GhanaHomePage/health/',
'https://www.ghanaweb.com/GhanaHomePage/crime/',
'https://www.ghanaweb.com/GhanaHomePage/regional/',
'https://www.ghanaweb.com/GhanaHomePage/year-in-review/'
]
for url in urls:
print(f"Downloading: {url}")
web = GhanaWeb(url=url)
# download to current working directory
# if no location is specified
# web.download(output_dir="/Users/tsiameh/Desktop/")
web.download(output_dir=None)
Scrape data from MyJoyOnline
from myjoyonline.scraper import MyJoyOnline
url = 'https://www.myjoyonline.com/news/'
print(f"Downloading data from: {url}")
joy = MyJoyOnline(url=url)
# download to current working directory
# if no location is specified
# joy.download(output_dir="/Users/tsiameh/Desktop/")
joy.download()
from myjoyonline.scraper import MyJoyOnline
urls = [
'https://www.myjoyonline.com/news/',
'https://www.myjoyonline.com/entertainment/',
'https://www.myjoyonline.com/business/',
'https://www.myjoyonline.com/sports/',
'https://www.myjoyonline.com/opinion/'
]
for url in urls:
print(f"Downloading data from: {url}")
joy = MyJoyOnline(url=url)
# download to current working directory
# if no location is specified
# joy.download(output_dir="/Users/tsiameh/Desktop/")
joy.download()
Scrape data from CitiBusinessNews
from citionline.scraper import CitiBusinessOnline
urls = [
"https://citibusinessnews.com/ghanabusinessnews/features/",
"https://citibusinessnews.com/ghanabusinessnews/telecoms-technology/",
"https://citibusinessnews.com/ghanabusinessnews/international/",
"https://citibusinessnews.com/ghanabusinessnews/news/government/",
"https://citibusinessnews.com/ghanabusinessnews/news/",
"https://citibusinessnews.com/ghanabusinessnews/business/",
"https://citibusinessnews.com/ghanabusinessnews/news/economy/",
"https://citibusinessnews.com/ghanabusinessnews/news/general/",
"https://citibusinessnews.com/ghanabusinessnews/news/top-stories/",
"https://citibusinessnews.com/ghanabusinessnews/business/tourism/"
]
for url in urls:
print(f"Downloading data from: {url}")
citi = CitiBusinessOnline(url=url)
citi.download()
Scrape data from DailyGraphic
from graphiconline.scraper import GraphicOnline
urls = [
"https://www.graphic.com.gh/news.html",
"https://www.graphic.com.gh/news/politics.html",
"https://www.graphic.com.gh/lifestyle.html",
"https://www.graphic.com.gh/news/education.html",
"https://www.graphic.com.gh/native-daughter.html",
"https://www.graphic.com.gh/international.html"
]
for url in urls:
print(f"Downloading data from: {url}")
graphic = GraphicOnline(url=url)
graphic.download()
Scrape data from YenGH
from yen.scraper import Yen
urls = [
"https://www.yen.com.gh/",
"https://yen.com.gh/people/",
"https://yen.com.gh/ghana/",
"https://yen.com.gh/education/",
"https://yen.com.gh/entertainment/",
"https://yen.com.gh/business-economy/",
"https://www.yen.com.gh/politics/",
"https://www.yen.com.gh/world/",
"https://www.yen.com.gh/world/europe/",
"https://www.yen.com.gh/world/asia/",
"https://www.yen.com.gh/world/africa/"
]
for url in urls:
print(f"Downloading data from: {url}")
yen = Yen(url=url)
yen.download()
Scrape data from MyNewsGh
from mynewsgh.scraper import MyNewsGh
# scrape from multiple URLs
urls = [
"https://www.mynewsgh.com/category/politics/",
"https://www.mynewsgh.com/category/news/",
"https://www.mynewsgh.com/category/entertainment/",
"https://www.mynewsgh.com/category/business/",
"https://www.mynewsgh.com/category/lifestyle/",
"https://www.mynewsgh.com/tag/feature/",
"https://www.mynewsgh.com/category/world/",
"https://www.mynewsgh.com/category/sports/"
]
for url in urls:
print(f"Downloading data from: {url}")
my_news = MyNewsGh(url=url, limit_pages=50)
my_news.download()
# scrape from a single URL
from mynewsgh.scraper import MyNewsGh
url = "https://www.mynewsgh.com/category/politics/"
my_news = MyNewsGh(url=url, limit_pages=None)
my_news.download()
Scrape data from 3News
from threenews.scraper import ThreeNews
# DO NOT RUN ALL AUTHORS: select ONLY a few
# DO NOT CHANGE THE AUTHOR NAMES
authors = [
"laud-nartey",
"3xtra",
"essel-issac",
"arabaincoom",
"bbc",
"betty-kankam-boadu",
"kwameamoh",
"fiifi_forson",
"fdoku",
"frankappiah",
"godwin-asediba",
"afua-somuah",
"irene",
"joyce-sesi",
"3news_user",
"ntollo",
"pwaberi-denis",
"sonia-amade",
"effah-steven",
"michael-tetteh"
]
for author in authors:
print(f"Downloading data from author: {author}")
three_news = ThreeNews(author=author, limit_pages=50)
three_news.download()
# OR
from threenews.scraper import ThreeNews
three = ThreeNews(author="laud-nartey", limit_pages=None)
three.download()
Scrape data from PulseGh
- select ONLY a few URLs
- news, entertainment, business, and lifestyle each have 40 pages
- business/domestic has 25 pages
- business/international has 40 pages
- sports/football has 99 pages
- news/politics has 40 pages
- news/local has 40 pages
- news/world has 40 pages
- news/filla has 38 pages
- entertainment/celebrities has 40 pages
- lifestyle/fashion has 40 pages
- Note: these values may change
from pulsegh.scraper import PulseGh
urls = [
"https://www.pulse.com.gh/news",
"https://www.pulse.com.gh/news/politics",
"https://www.pulse.com.gh/entertainment",
"https://www.pulse.com.gh/lifestyle",
"https://www.pulse.com.gh/sports",
"https://www.pulse.com.gh/sports/football",
"https://www.pulse.com.gh/business/international",
"https://www.pulse.com.gh/business/domestic",
"https://www.pulse.com.gh/business",
"https://www.pulse.com.gh/quizzes"
]
for url in urls:
print(f"Downloading data from: {url}")
pulse = PulseGh(url=url, limit_pages=5)
pulse.download()
# news has 40 pages
from pulsegh.scraper import PulseGh
pulse = PulseGh(url="https://www.pulse.com.gh/news", total_pages = 40, limit_pages=20)
pulse.download()
# Sports/football has 99 pages
from pulsegh.scraper import PulseGh
pulse = PulseGh(url="https://www.pulse.com.gh/sports/football", total_pages=99, limit_pages=None)
pulse.download()
BuyMeCoffee
Credits
Theophilus Siameh
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
ghananews-scraper-1.0.12.tar.gz
(14.5 kB
view hashes)
Built Distribution
Close
Hashes for ghananews_scraper-1.0.12-py3-none-any.whl
Algorithm | Hash digest
---|---
SHA256 | 8cbf3f5695e803007dce24be6bf0b1980b0652a118a2440b3a6e49b142b716e3
MD5 | 67157e6aaca47ae520e81b869b096958
BLAKE2b-256 | f082abed959b43f8f83f3ee21bfd5e790412f0b7dfeea9b1e1843ca596b2d76b