A python package to scrape data from Ghana News Portals

Project description

Supports Python 3.7, 3.8, and 3.9.

GhanaNews Scraper

A simple, unofficial Python package to scrape data from GhanaWeb, MyJoyOnline, DailyGraphic, CitiBusinessNews, YenGH, MyNewsGh, and 3News. Affiliated projects: Bank of Ghana FX Rates and GhanaShops-Scraper.

How to install

pip install ghananews-scraper
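
After installing, a quick import check confirms the package is available. The module paths below are the same ones used in the usage examples further down this page:

# Sanity check: these imports should succeed after installation.
from ghanaweb.scraper import GhanaWeb
from myjoyonline.scraper import MyJoyOnline

print("ghananews-scraper imports OK")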

Example Google Colab Notebook

Click Here: Google Colab Notebook

Warning: do NOT run the GhanaWeb code in an online Google Colab notebook.

Some GhanaWeb Urls:

urls = [
    "https://www.ghanaweb.com/GhanaHomePage/regional/",
    "https://www.ghanaweb.com/GhanaHomePage/editorial/",
    "https://www.ghanaweb.com/GhanaHomePage/health/",
    "https://www.ghanaweb.com/GhanaHomePage/diaspora/",
    "https://www.ghanaweb.com/GhanaHomePage/tabloid/",
    "https://www.ghanaweb.com/GhanaHomePage/africa/",
    "https://www.ghanaweb.com/GhanaHomePage/religion/",
    "https://www.ghanaweb.com/GhanaHomePage/NewsArchive/",
    "https://www.ghanaweb.com/GhanaHomePage/business/",
    "https://www.ghanaweb.com/GhanaHomePage/SportsArchive/",
    "https://www.ghanaweb.com/GhanaHomePage/entertainment/",
    "https://www.ghanaweb.com/GhanaHomePage/television/"
]

Outputs

  • All outputs are saved to a .csv file; other file formats are not yet supported. A short sketch for loading the output follows below.
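
Once a scrape finishes, the CSV can be inspected with pandas (assuming pandas is installed). A minimal sketch; the exact output file name depends on the scraper, so this simply picks up whatever CSV is in the working directory:

import glob

import pandas as pd

# Pick up whatever CSV the scraper wrote; the naming scheme is an assumption.
csv_files = glob.glob("*.csv")
if csv_files:
    df = pd.read_csv(csv_files[0])
    print(df.shape)   # (rows, columns)
    print(df.head())  # first few articles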

Usage

from ghanaweb.scraper import GhanaWeb

url = 'https://www.ghanaweb.com/GhanaHomePage/politics/'
# url = "https://www.ghanaweb.com/GhanaHomePage/NewsArchive/"
# url = 'https://www.ghanaweb.com/GhanaHomePage/health/'
# url = 'https://www.ghanaweb.com/GhanaHomePage/crime/'
# url = 'https://www.ghanaweb.com/GhanaHomePage/regional/'
# url = 'https://www.ghanaweb.com/GhanaHomePage/year-in-review/'

# web = GhanaWeb(url='https://www.ghanaweb.com/GhanaHomePage/politics/')
web = GhanaWeb(url=url)
# scrape data and save to the current working directory
web.download(output_dir=None)

Scrape a list of articles from GhanaWeb

from ghanaweb.scraper import GhanaWeb

urls = [
        'https://www.ghanaweb.com/GhanaHomePage/politics/',
        'https://www.ghanaweb.com/GhanaHomePage/health/',
        'https://www.ghanaweb.com/GhanaHomePage/crime/',
        'https://www.ghanaweb.com/GhanaHomePage/regional/',
        'https://www.ghanaweb.com/GhanaHomePage/year-in-review/'
    ]

for url in urls:
    print(f"Downloading: {url}")
    web = GhanaWeb(url=url)
    # download to current working directory
    # if no location is specified
    # web.download(output_dir="/Users/tsiameh/Desktop/")
    web.download(output_dir=None)
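
When looping over many sections, it helps to be polite to the server and resilient to one-off failures. A sketch that wraps the same loop with basic error handling and a pause; the delay length is an arbitrary choice, not a package requirement:

import time

from ghanaweb.scraper import GhanaWeb

urls = [
    'https://www.ghanaweb.com/GhanaHomePage/politics/',
    'https://www.ghanaweb.com/GhanaHomePage/health/',
]

for url in urls:
    try:
        print(f"Downloading: {url}")
        GhanaWeb(url=url).download(output_dir=None)
    except Exception as exc:
        # Skip sections that fail instead of aborting the whole run.
        print(f"Failed to download {url}: {exc}")
    time.sleep(5)  # pause between sections to avoid hammering the site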

Scrape data from MyJoyOnline

from myjoyonline.scraper import MyJoyOnline

url = 'https://www.myjoyonline.com/news/'

print(f"Downloading data from: {url}")
joy = MyJoyOnline(url=url)
# download to current working directory
# if no location is specified
# joy.download(output_dir="/Users/tsiameh/Desktop/")
joy.download()

Scrape a list of articles from MyJoyOnline

from myjoyonline.scraper import MyJoyOnline

urls = [
        'https://www.myjoyonline.com/news/',
        'https://www.myjoyonline.com/entertainment/',
        'https://www.myjoyonline.com/business/',
        'https://www.myjoyonline.com/sports/',
        'https://www.myjoyonline.com/opinion/'
    ]

for url in urls:
    print(f"Downloading data from: {url}")
    joy = MyJoyOnline(url=url)
    # download to current working directory
    # if no location is specified
    # joy.download(output_dir="/Users/tsiameh/Desktop/")
    joy.download()
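
The output_dir argument shown in the comments also accepts a custom folder. A sketch that collects one run into a dated directory; creating the directory up front is an assumption, in case the scraper does not create missing folders itself:

from datetime import date
from pathlib import Path

from myjoyonline.scraper import MyJoyOnline

# One dated folder per run, e.g. ./myjoyonline-2024-05-01/
out_dir = Path(f"myjoyonline-{date.today()}")
out_dir.mkdir(exist_ok=True)  # assumption: the scraper may not create it

joy = MyJoyOnline(url='https://www.myjoyonline.com/news/')
joy.download(output_dir=str(out_dir))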

Scrape data from CitiBusinessNews

from citionline.scraper import CitiBusinessOnline

urls = [
    "https://citibusinessnews.com/ghanabusinessnews/features/",
    "https://citibusinessnews.com/ghanabusinessnews/telecoms-technology/",
    "https://citibusinessnews.com/ghanabusinessnews/international/",
    "https://citibusinessnews.com/ghanabusinessnews/news/government/",
    "https://citibusinessnews.com/ghanabusinessnews/news/",
    "https://citibusinessnews.com/ghanabusinessnews/business/",
    "https://citibusinessnews.com/ghanabusinessnews/news/economy/",
    "https://citibusinessnews.com/ghanabusinessnews/news/general/",
    "https://citibusinessnews.com/ghanabusinessnews/news/top-stories/",
    "https://citibusinessnews.com/ghanabusinessnews/business/tourism/"
]

for url in urls:
    print(f"Downloading data from: {url}")
    citi = CitiBusinessOnline(url=url)
    citi.download()
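
After scraping several sections, the per-section CSVs can be combined into one dataset. A sketch using pandas; since the column layout is whatever the scraper wrote, dropping duplicates on full rows is the safe default (run it in a directory that contains only the scraped CSVs):

import glob

import pandas as pd

# Merge every CSV the scrapers produced in the working directory.
frames = [pd.read_csv(path) for path in glob.glob("*.csv")]
merged = pd.concat(frames, ignore_index=True).drop_duplicates()
merged.to_csv("citibusinessnews_all.csv", index=False)
print(f"{len(merged)} unique rows")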

Scrape data from DailyGraphic

from graphiconline.scraper import GraphicOnline

urls = [
    "https://www.graphic.com.gh/news.html",
    "https://www.graphic.com.gh/news/politics.html",
    "https://www.graphic.com.gh/lifestyle.html",
    "https://www.graphic.com.gh/news/education.html",
    "https://www.graphic.com.gh/native-daughter.html",
    "https://www.graphic.com.gh/international.html"
]

for url in urls:
    print(f"Downloading data from: {url}")
    graphic = GraphicOnline(url=url)
    graphic.download()

Scrape data from YenGH

from yen.scraper import Yen

urls = [
    "https://www.yen.com.gh/",
    "https://yen.com.gh/people/",
    "https://yen.com.gh/ghana/",
    "https://yen.com.gh/education/",
    "https://yen.com.gh/entertainment/",
    "https://yen.com.gh/business-economy/",
    "https://www.yen.com.gh/politics/",
    "https://www.yen.com.gh/world/",
    "https://www.yen.com.gh/world/europe/",
    "https://www.yen.com.gh/world/asia/",
    "https://www.yen.com.gh/world/africa/"
]

for url in urls:
    print(f"Downloading data from: {url}")
    yen = Yen(url=url)
    yen.download()
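
Transient network errors are common on long runs. A sketch that retries each Yen download a few times with a growing pause; the retry count and delays are arbitrary choices, not part of the package:

import time

from yen.scraper import Yen

def download_with_retries(url, attempts=3):
    """Retry a download, backing off 10s, 20s, ... between attempts."""
    for attempt in range(1, attempts + 1):
        try:
            Yen(url=url).download()
            return True
        except Exception as exc:
            print(f"Attempt {attempt} failed for {url}: {exc}")
            time.sleep(10 * attempt)
    return False

download_with_retries("https://www.yen.com.gh/politics/")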

Scrape data from MyNewsGh

from mynewsgh.scraper import MyNewsGh

# scrape from multiple URLs
urls = [
  "https://www.mynewsgh.com/category/politics/",
  "https://www.mynewsgh.com/category/news/",
  "https://www.mynewsgh.com/category/entertainment/",
  "https://www.mynewsgh.com/category/business/",
  "https://www.mynewsgh.com/category/lifestyle/",
  "https://www.mynewsgh.com/tag/feature/",
  "https://www.mynewsgh.com/category/world/",
  "https://www.mynewsgh.com/category/sports/"
]

for url in urls:
    print(f"Downloading data from: {url}")
    my_news = MyNewsGh(url=url, limit_pages=50)
    my_news.download()

# scrape from a single URL
from mynewsgh.scraper import MyNewsGh

url = "https://www.mynewsgh.com/category/politics/"
my_news = MyNewsGh(url=url, limit_pages=None)
my_news.download()
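
limit_pages caps how many listing pages are crawled (None scrapes everything). A small command-line wrapper, sketched around the constructor shown above, makes that easy to tune per run; it is not part of the package itself:

import argparse

from mynewsgh.scraper import MyNewsGh

parser = argparse.ArgumentParser(description="Scrape a MyNewsGh category")
parser.add_argument("url", help="e.g. https://www.mynewsgh.com/category/politics/")
parser.add_argument("--limit-pages", type=int, default=None,
                    help="max listing pages to crawl (default: all)")
args = parser.parse_args()

MyNewsGh(url=args.url, limit_pages=args.limit_pages).download()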

Scrape data from 3News

from threenews.scraper import ThreeNews

# DO NOT run all authors: select only a few
# DO NOT CHANGE THE AUTHOR NAMES
authors = [
  "laud-nartey",
  "3xtra",
  "essel-issac",
  "arabaincoom",
  "bbc",
  "betty-kankam-boadu",
  "kwameamoh",
  "fiifi_forson",
  "fdoku",
  "frankappiah",
  "godwin-asediba",
  "afua-somuah",
  "irene",
  "joyce-sesi",
  "3news_user",
  "ntollo",
  "pwaberi-denis",
  "sonia-amade",
  "effah-steven",
  "michael-tetteh"
]

for author in authors:
    print(f"Downloading data from author: {author}")
    three_news = ThreeNews(author=author, limit_pages=50)
    three_news.download()

# OR
from threenews.scraper import ThreeNews

three = ThreeNews(author="laud-nartey", limit_pages=None)
three.download()
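
To respect the "select only a few" warning above while still sampling several writers, you can pick a random subset of the author slugs. A sketch; the slugs come from the list above, and the page limit is an arbitrary choice:

import random

from threenews.scraper import ThreeNews

authors = ["laud-nartey", "essel-issac", "betty-kankam-boadu", "kwameamoh"]

# Pick two authors at random instead of scraping everyone.
for author in random.sample(authors, k=2):
    print(f"Downloading data from author: {author}")
    ThreeNews(author=author, limit_pages=10).download()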

Credits

  • Theophilus Siameh (tsiameh on Twitter)

Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

ghananews-scraper-1.0.10.tar.gz (11.0 kB)

Uploaded Source

Built Distribution

ghananews_scraper-1.0.10-py3-none-any.whl (23.4 kB)

Uploaded Python 3

File details

Details for the file ghananews-scraper-1.0.10.tar.gz.

File metadata

  • Download URL: ghananews-scraper-1.0.10.tar.gz
  • Upload date:
  • Size: 11.0 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/4.0.2 CPython/3.11.1

File hashes

Hashes for ghananews-scraper-1.0.10.tar.gz

Algorithm     Hash digest
SHA256        9526743e529cce3d5d0a3c1735f9c953247dd0c681ae35c24fb80aa29e50abb5
MD5           8c67c3c940c4f5c5215f1e493cc3dc2a
BLAKE2b-256   bf50ca712eba1fe22c335bcc4c57ab07cebfcccbeab227c6f740a73a38dacdf1

See more details on using hashes here.

File details

Details for the file ghananews_scraper-1.0.10-py3-none-any.whl.

File metadata

File hashes

Hashes for ghananews_scraper-1.0.10-py3-none-any.whl

Algorithm     Hash digest
SHA256        591efac830a8afa1021ec57483cb70a16df2c3e17008a3bac62019464f8da91e
MD5           a57c5704c5d9d63764b00bda70972003
BLAKE2b-256   0f9d44e5be47e5167e9c5139d3c1d77c321c1d0f69db23d3bfe0366b62914323

See more details on using hashes here.
