
Cached http fetcher

This library fetches data over HTTP(S) and stores it in a storage backend that is accessible from the internet.

Install

pip install cached-http-fetcher

Usage

The standard way to use this library is to store metadata in Redis and store the content itself (e.g. images) in S3, so that the objects in S3 can be served directly over the internet.

You have to implement your own meta storage by extending MetaStorageBase (backed by Redis) and your own content storage by extending ContentStorageBase (backed by S3).

The complete sample code follows; you can also see and run fetch_with_memory.py.

import logging
from typing import Optional
from urllib.parse import quote, urlsplit

import boto3
from botocore.exceptions import ClientError
from redis import Redis

import cached_http_fetcher


class RedisMetaStorage(cached_http_fetcher.MetaStorageBase):
    def __init__(self, settings):
        self.redis = Redis.from_url("redis://redis:6379/0")

    def get(self, source_url: str) -> Optional[bytes]:
        return self.redis.get(source_url)

    def delete(self, source_url: str) -> None:
        self.redis.delete(source_url)

    def put(self, source_url: str, value: bytes) -> None:
        self.redis.set(source_url, value)


class S3ContentStorage(cached_http_fetcher.ContentStorageBase):
    def __init__(self, settings):
        self.s3 = boto3.client("s3")
        self.bucket = "some-bucket"

    @staticmethod
    def s3_key(source_url: str) -> str:
        # Map the source URL to an S3 key; https URLs get an extra "s/" prefix.
        r = urlsplit(source_url)
        s3_key = "path-prefix/"
        if r.scheme == "http":
            pass
        elif r.scheme == "https":
            s3_key += "s/"
        else:
            raise ValueError(f"unsupported scheme: {r.scheme}")
        s3_key += r.netloc + r.path
        if r.query:
            s3_key += "?" + r.query
        return quote(s3_key)

    def get(self, source_url: str) -> Optional[bytes]:
        try:
            obj = self.s3.get_object(Bucket=self.bucket, Key=S3ContentStorage.s3_key(source_url))
            return obj["Body"].read()
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchKey":
                return None
            raise

    def delete(self, source_url: str) -> None:
        self.s3.delete_object(Bucket=self.bucket, Key=S3ContentStorage.s3_key(source_url))

    def put_content(self, source_url: str, value: bytes, cache_control: str, content_type: Optional[str] = None) -> None:
        key = S3ContentStorage.s3_key(source_url)
        config = dict(ACL="bucket-owner-full-control", Bucket=self.bucket, Key=key, CacheControl=cache_control, Body=value)
        if content_type:
            config["ContentType"] = content_type
        self.s3.put_object(**config)

    def cached_url(self, source_url: str) -> str:
        return f"https://{self.bucket}.s3.us-west-2.amazonaws.com/{S3ContentStorage.s3_key(source_url)}"


url_list = ["http://www.example.com/image1.jpg", "http://www.example.com/image2.jpg"]
settings = None  # placeholder; these sample storage classes do not use it
meta_storage = RedisMetaStorage(settings)
content_storage = S3ContentStorage(settings)
logger = logging.getLogger(__name__)

cached_http_fetcher.fetch_urls(url_list, meta_storage, content_storage, logger=logger)

for url in url_list:
    meta = cached_http_fetcher.get_meta(url, meta_storage, logger=logger)
    print(meta.cached_url)
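
For local testing without Redis or S3, the same interfaces can be backed by plain dictionaries. The sketch below is a minimal, hypothetical in-memory implementation in the spirit of fetch_with_memory.py; the method signatures mirror the sample above, and the memory:// scheme returned by cached_url is only an illustrative placeholder, not a real public URL.

from typing import Optional

import cached_http_fetcher


class MemoryMetaStorage(cached_http_fetcher.MetaStorageBase):
    # Meta storage backed by an in-memory dict (tests only).
    def __init__(self):
        self._store = {}

    def get(self, source_url: str) -> Optional[bytes]:
        return self._store.get(source_url)

    def delete(self, source_url: str) -> None:
        self._store.pop(source_url, None)

    def put(self, source_url: str, value: bytes) -> None:
        self._store[source_url] = value


class MemoryContentStorage(cached_http_fetcher.ContentStorageBase):
    # Content storage backed by an in-memory dict (tests only).
    def __init__(self):
        self._store = {}

    def get(self, source_url: str) -> Optional[bytes]:
        return self._store.get(source_url)

    def delete(self, source_url: str) -> None:
        self._store.pop(source_url, None)

    def put_content(self, source_url: str, value: bytes, cache_control: str, content_type: Optional[str] = None) -> None:
        self._store[source_url] = value

    def cached_url(self, source_url: str) -> str:
        # Placeholder scheme; a real storage would return a publicly reachable URL.
        return f"memory://{source_url}"

Instances of these classes can be passed to fetch_urls and get_meta exactly as in the Redis/S3 sample above.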

Develop

make test # test
make dist # release to PyPI
