A collection of typed, async Python wrappers for the French National Library Gallica API.
Project description
gallica-getter
Find documents in which a word occurs, retrieve the context around each occurrence, and fetch the full text of OCR document pages. Compose Gallica services using Python classes that each represent one service.
Examples
Here are a few examples from a JSON API I am currently hosting:
async def get_documents_with_occurrences(
    args: ContextSearchArgs,
    on_get_total_records: Callable[[int], None],
    on_get_origin_urls: Callable[[List[str]], None],
    session: aiohttp.ClientSession,
    semaphore: asyncio.Semaphore,
) -> List[VolumeRecord]:
    """Query Gallica's SRU API for the volumes in which the search terms occur.

    Args:
        args: Search parameters; this function reads ``terms``, ``year``,
            ``month``, ``day``, ``end_year``, ``end_month``, ``codes``,
            ``source``, ``link_term``, ``link_distance``, ``limit``,
            ``cursor`` and ``sort`` from it.
        on_get_total_records: Callback forwarded untouched to the wrapper.
        on_get_origin_urls: Callback forwarded untouched to the wrapper.
        session: HTTP session forwarded to the wrapper.
        semaphore: Semaphore forwarded to the wrapper.

    Returns:
        The records produced by ``VolumeOccurrence.get``, materialized
        into a list.
    """
    # The (term, distance) link is only sent when both values are truthy;
    # a distance of 0 therefore disables it.
    proximity_link = (
        (args.link_term, args.link_distance)
        if args.link_distance and args.link_term
        else None
    )

    occurrence_service = VolumeOccurrence()
    found_volumes = await occurrence_service.get(
        terms=args.terms,
        start_date=make_date_from_year_mon_day(args.year, args.month, args.day),
        # NOTE(review): the end date reuses ``args.day`` (the start day);
        # confirm whether an end-specific day was intended.
        end_date=make_date_from_year_mon_day(args.end_year, args.end_month, args.day),
        codes=args.codes,
        source=args.source,
        link=proximity_link,
        limit=args.limit,
        # A missing/falsy cursor means "start from the first record".
        start_index=args.cursor or 0,
        sort=args.sort,
        on_get_total_records=on_get_total_records,
        on_get_origin_urls=on_get_origin_urls,
        session=session,
        semaphore=semaphore,
    )
    return list(found_volumes)
async def get_sample_context_in_documents(
    records: List[VolumeRecord],
    session: aiohttp.ClientSession,
    semaphore: asyncio.Semaphore,
) -> List[ExtractRoot]:
    """Query Gallica's search-result API for a sample of context per record.

    Only the first term of each record is sent; a warning is printed when
    any record carries more than one term.

    Args:
        records: Volume records; ``ark`` and ``terms`` are read from each.
        session: HTTP session forwarded to the wrapper.
        semaphore: Semaphore forwarded to the wrapper.

    Returns:
        The results of ``ContextSnippets.get``, materialized into a list.
    """
    multi_term_present = any(len(rec.terms) > 1 for rec in records)
    if multi_term_present:
        print(
            "Warning: using sample context for multi-word terms; only the first term will be used."
        )

    snippet_service = ContextSnippets()
    # One (ark, first term) pair per record.
    ark_term_pairs = [(rec.ark, rec.terms[0]) for rec in records]
    snippets = await snippet_service.get(
        ark_term_pairs,
        session=session,
        semaphore=semaphore,
    )
    return list(snippets)
async def get_context_include_full_page(
    keyed_docs: Dict[str, VolumeRecord],
    session: aiohttp.ClientSession,
    sem: asyncio.Semaphore,
    context_source: Callable,
):
    """Query Context and PageText for the text of every page a term occurs on.

    Args:
        keyed_docs: Volume records; responses from ``context_source`` are
            looked up in this mapping by their ``ark``.
        session: HTTP session forwarded to both services.
        sem: Semaphore forwarded to both services.
        context_source: Awaitable callable invoked with ``records``,
            ``session`` and ``semaphore`` keyword arguments; each item it
            yields must expose ``ark`` and ``pages``.

    Returns:
        A list of ``GallicaRecordFullPageText`` objects, one per distinct
        ``ark`` among the input records, whose ``context`` list holds one
        dict per fetched page with ``page_num``, ``text`` and ``page_url``.
    """
    text_service = PageText()
    pending_pages: List[PageQuery] = []

    # Start every document with an empty context list; pages are attached
    # below once their text has been fetched.
    full_text_by_ark: Dict[str, GallicaRecordFullPageText] = {
        doc.ark: GallicaRecordFullPageText(**doc.dict(), context=[])
        for doc in keyed_docs.values()
    }

    context_responses = await context_source(
        records=list(keyed_docs.values()), session=session, semaphore=sem
    )
    for response in context_responses:
        # Looked up once per response, even when it lists no pages.
        source_doc = keyed_docs[response.ark]
        pending_pages.extend(
            PageQuery(
                ark=source_doc.ark,
                page_num=int(hit.page_num),
            )
            for hit in response.pages
        )

    fetched_pages = await text_service.get(
        page_queries=pending_pages, semaphore=sem, session=session
    )
    for fetched in fetched_pages:
        target = full_text_by_ark[fetched.ark]
        # NOTE(review): terms are joined with spaces and placed in the URL
        # without escaping — confirm that is what consumers expect.
        joined_terms = " ".join(target.terms)
        target.context.append(
            {
                "page_num": fetched.page_num,
                "text": fetched.text,
                "page_url": f"{target.url}/f{fetched.page_num}.image.r={joined_terms}",
            }
        )

    return list(full_text_by_ark.values())
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Filter files by name, interpreter, ABI, and platform.
If you're not sure about the file name format, learn more about wheel file names.
Copy a direct link to the current filters
File details
Details for the file gallicagetter-2.0.0.tar.gz.
File metadata
- Download URL: gallicagetter-2.0.0.tar.gz
- Upload date:
- Size: 25.2 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.10.6
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | ca540c54c01d3e492c941b385fad34eeb044f5c4b77dab7b9061848f10eaa3c5 |
| MD5 | f9082f26800b99b56758865c24aa76aa |
| BLAKE2b-256 | c6ce9b88f3680c88a61614dd46d973b2a242cd25466d918821ee6ab96aff19b2 |
File details
Details for the file gallicagetter-2.0.0-py3-none-any.whl.
File metadata
- Download URL: gallicagetter-2.0.0-py3-none-any.whl
- Upload date:
- Size: 35.8 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.10.6
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | 190a67ebcc08055a38d18e4a7847c87d4afdcffda2d511edb93c764670547622 |
| MD5 | 13f5cfacc57a87975f639fc154174b6d |
| BLAKE2b-256 | 996d497bb4c55f244f4a4aac4afd9e45653eebe80bd349bb4d93d34b6733a398 |