A cached HTTP fetcher
Project description
Cached HTTP fetcher
This library fetches data via HTTP(S) and stores it in a storage that is accessible from the internet.
Install
pip install cached-http-fetcher
Usage
The standard way to use this library is to store metadata in Redis and the images themselves in S3, so that the images in S3 can be served directly from the internet.
You have to implement your own meta storage, extending MetaStorageBase with Redis, and your own content storage, extending ContentStorageBase with S3.
The whole sample code follows; alternatively, see and run fetch_with_memory.py.
import logging
from typing import Optional
from urllib.parse import quote, urlsplit

import boto3
import cached_http_fetcher
from botocore.exceptions import ClientError
from redis import Redis


class RedisMetaStorage(cached_http_fetcher.MetaStorageBase):
    def __init__(self, settings):
        self.redis = Redis.from_url("redis://redis:6379/0")

    def get(self, source_url: str) -> Optional[bytes]:
        return self.redis.get(source_url)

    def delete(self, source_url: str) -> None:
        self.redis.delete(source_url)

    def put(self, source_url: str, value: bytes) -> None:
        self.redis.set(source_url, value)


class S3ContentStorage(cached_http_fetcher.ContentStorageBase):
    def __init__(self, settings):
        self.s3 = boto3.client("s3")
        self.bucket = "some-bucket"

    @staticmethod
    def s3_key(source_url: str) -> str:
        # Map a source URL to an S3 key, e.g.
        #   http://www.example.com/a.jpg  -> path-prefix/www.example.com/a.jpg
        #   https://www.example.com/a.jpg -> path-prefix/s/www.example.com/a.jpg
        r = urlsplit(source_url)
        s3_key = "path-prefix/"
        if r.scheme == "http":
            pass
        elif r.scheme == "https":
            s3_key += "s/"
        else:
            raise ValueError(f"unsupported scheme: {r.scheme}")
        s3_key += r.netloc + r.path
        if r.query:
            s3_key += "?" + r.query
        return quote(s3_key)

    def get(self, source_url: str) -> Optional[bytes]:
        try:
            obj = self.s3.get_object(Bucket=self.bucket, Key=S3ContentStorage.s3_key(source_url))
            return obj["Body"].read()
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchKey":
                return None
            else:
                raise

    def delete(self, source_url: str) -> None:
        self.s3.delete_object(Bucket=self.bucket, Key=S3ContentStorage.s3_key(source_url))

    def put_content(self, source_url: str, value: bytes, cache_control: str, content_type: Optional[str] = None) -> None:
        key = S3ContentStorage.s3_key(source_url)
        config = dict(ACL="bucket-owner-full-control", Bucket=self.bucket, Key=key, CacheControl=cache_control, Body=value)
        if content_type:
            config["ContentType"] = content_type
        self.s3.put_object(**config)

    def cached_url(self, source_url: str) -> str:
        return f"https://{self.bucket}.s3.us-west-2.amazonaws.com/{S3ContentStorage.s3_key(source_url)}"


url_list = ["http://www.example.com/image1.jpg", "http://www.example.com/image2.jpg"]
settings = None  # placeholder; the sample storages above do not use it
meta_storage = RedisMetaStorage(settings)
content_storage = S3ContentStorage(settings)
logger = logging.getLogger(__name__)

cached_http_fetcher.fetch_urls(url_list, meta_storage, content_storage, logger=logger)

for url in url_list:
    meta = cached_http_fetcher.get_meta(url, meta_storage, logger=logger)
    print(meta.cached_url)
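For quick local experiments without Redis or S3, the same base classes can be backed by plain dictionaries. The sketch below is only illustrative: the class names are made up here and it mirrors just the methods used in the sample above, so the bundled fetch_with_memory.py may be organized differently.

import logging
from typing import Dict, Optional

import cached_http_fetcher


class MemoryMetaStorage(cached_http_fetcher.MetaStorageBase):
    """Keeps metadata in a plain dict instead of Redis (illustrative only)."""

    def __init__(self) -> None:
        self._meta: Dict[str, bytes] = {}

    def get(self, source_url: str) -> Optional[bytes]:
        return self._meta.get(source_url)

    def delete(self, source_url: str) -> None:
        self._meta.pop(source_url, None)

    def put(self, source_url: str, value: bytes) -> None:
        self._meta[source_url] = value


class MemoryContentStorage(cached_http_fetcher.ContentStorageBase):
    """Keeps fetched bodies in a plain dict instead of S3 (illustrative only)."""

    def __init__(self) -> None:
        self._content: Dict[str, bytes] = {}

    def get(self, source_url: str) -> Optional[bytes]:
        return self._content.get(source_url)

    def delete(self, source_url: str) -> None:
        self._content.pop(source_url, None)

    def put_content(self, source_url: str, value: bytes, cache_control: str, content_type: Optional[str] = None) -> None:
        # cache_control / content_type are ignored; nothing is served by a CDN here.
        self._content[source_url] = value

    def cached_url(self, source_url: str) -> str:
        # In-memory content has no public endpoint, so just echo the source URL.
        return source_url


meta_storage = MemoryMetaStorage()
content_storage = MemoryContentStorage()
logger = logging.getLogger(__name__)
cached_http_fetcher.fetch_urls(["http://www.example.com/image1.jpg"], meta_storage, content_storage, logger=logger)

This kind of in-memory pair is convenient for unit tests, since fetch_urls and get_meta can then run without any external services.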
Develop
make test # test
make dist # release to PyPI
Project details
Download files
Source Distributions
No source distribution files are available for this release.
Built Distribution
File details
Details for the file cached_http_fetcher-0.1.6-py3-none-any.whl.
File metadata
- Download URL: cached_http_fetcher-0.1.6-py3-none-any.whl
- Upload date:
- Size: 11.8 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/3.4.1 importlib_metadata/4.6.1 pkginfo/1.7.0 requests/2.25.1 requests-toolbelt/0.9.1 tqdm/4.61.0 CPython/3.9.4
File hashes
Algorithm   | Hash digest
------------|-----------------------------------------------------------------
SHA256      | 857f324420351fbd00201549ad0b5a8f3d3eb5174c13c4f54310529d519614e4
MD5         | 4116d3ba094d3228ac00e4f761f1b47a
BLAKE2b-256 | 48af8d3a02ed9bf5daa1628ed94b8ed659b383e10f6b13a1e389d03e1922270b