Skip to main content

A collection of typed, async Python wrappers for the French National Library Gallica API.

Project description

gallica-getter

Find documents where a word occurs, context for the occurrence, full text for OCR document pages. Compose Gallica services using Python classes that represent each service.

Examples

Here are a few examples from a JSON API I am currently hosting:

async def get_documents_with_occurrences(
    args: ContextSearchArgs,
    on_get_total_records: Callable[[int], None],
    on_get_origin_urls: Callable[[List[str]], None],
    session: aiohttp.ClientSession,
    semaphore: asyncio.Semaphore,
) -> List[VolumeRecord]:
    """Queries Gallica's SRU API to get metadata for a given term in the archive."""

    link = None
    if args.link_distance and args.link_term:
        link = (args.link_term, args.link_distance)

    # get the volumes in which the term appears
    volume_Gallica_wrapper = VolumeOccurrence()
    gallica_records = await volume_Gallica_wrapper.get(
        terms=args.terms,
        start_date=make_date_from_year_mon_day(args.year, args.month, args.day),
        end_date=make_date_from_year_mon_day(args.end_year, args.end_month, args.day),
        codes=args.codes,
        source=args.source,
        link=link,
        limit=args.limit,
        start_index=args.cursor or 0,
        sort=args.sort,
        on_get_total_records=on_get_total_records,
        on_get_origin_urls=on_get_origin_urls,
        session=session,
        semaphore=semaphore,
    )

    return list(gallica_records)
async def get_sample_context_in_documents(
    records: List[VolumeRecord],
    session: aiohttp.ClientSession,
    semaphore: asyncio.Semaphore,
) -> List[ExtractRoot]:
    """Queries Gallica's search result API to show a sample of context instead of the entire batch."""

    # warn if terms length is greater than 1
    if any(len(record.terms) > 1 for record in records):
        print(
            "Warning: using sample context for multi-word terms; only the first term will be used."
        )
    context_snippet_wrapper = ContextSnippets()
    context = await context_snippet_wrapper.get(
        [(record.ark, record.terms[0]) for record in records],
        session=session,
        semaphore=semaphore,
    )
    return list(context)
async def get_context_include_full_page(
    keyed_docs: Dict[str, VolumeRecord],
    session: aiohttp.ClientSession,
    sem: asyncio.Semaphore,
    context_source: Callable,
):
    """Queries Context and PageText to get the text of each page a term occurs on."""
    page_text_wrapper = PageText()
    queries: List[PageQuery] = []

    # build records to be filled with page text for each page w/occurrence
    gallica_records: Dict[str, GallicaRecordFullPageText] = {
        record.ark: GallicaRecordFullPageText(**record.dict(), context=[])
        for record in keyed_docs.values()
    }

    for context_response in await context_source(
        records=list(keyed_docs.values()), session=session, semaphore=sem
    ):
        record = keyed_docs[context_response.ark]
        for page in context_response.pages:
            queries.append(
                PageQuery(
                    ark=record.ark,
                    page_num=int(page.page_num),
                )
            )
    page_data = await page_text_wrapper.get(
        page_queries=queries, semaphore=sem, session=session
    )
    for occurrence_page in page_data:
        record = gallica_records[occurrence_page.ark]
        terms_string = " ".join(record.terms)

        record.context.append(
            {
                "page_num": occurrence_page.page_num,
                "text": occurrence_page.text,
                "page_url": f"{record.url}/f{occurrence_page.page_num}.image.r={terms_string}",
            }
        )
    return list(gallica_records.values())

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

gallicagetter-2.0.0.tar.gz (25.2 kB view details)

Uploaded Source

Built Distribution

gallicagetter-2.0.0-py3-none-any.whl (35.8 kB view details)

Uploaded Python 3

File details

Details for the file gallicagetter-2.0.0.tar.gz.

File metadata

  • Download URL: gallicagetter-2.0.0.tar.gz
  • Upload date:
  • Size: 25.2 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/4.0.2 CPython/3.10.6

File hashes

Hashes for gallicagetter-2.0.0.tar.gz
Algorithm Hash digest
SHA256 ca540c54c01d3e492c941b385fad34eeb044f5c4b77dab7b9061848f10eaa3c5
MD5 f9082f26800b99b56758865c24aa76aa
BLAKE2b-256 c6ce9b88f3680c88a61614dd46d973b2a242cd25466d918821ee6ab96aff19b2

See more details on using hashes here.

File details

Details for the file gallicagetter-2.0.0-py3-none-any.whl.

File metadata

File hashes

Hashes for gallicagetter-2.0.0-py3-none-any.whl
Algorithm Hash digest
SHA256 190a67ebcc08055a38d18e4a7847c87d4afdcffda2d511edb93c764670547622
MD5 13f5cfacc57a87975f639fc154174b6d
BLAKE2b-256 996d497bb4c55f244f4a4aac4afd9e45653eebe80bd349bb4d93d34b6733a398

See more details on using hashes here.

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page