Skip to content

Archive

Archive

Simple archive implementation for storing scraped files based on anystore

archive_source(uri, *args, url_key_only=False, cache=True, stealthy=False, delay=None, raise_on_error=True, **kwargs)

Archive a remote file and return the archive key

Parameters:

Name Type Description Default
url_key_only bool | None

Compute cache key just by url as fallback

False
cache bool | None

Whether to use the cache; set to False to disable caching entirely (force re-fetch)

True
stealthy bool | None

Use a random HTTP user agent (for HTTP remote sources)

False
delay int | None

Set a delay before fetching

None
raise_on_error bool | None

Throw exception or just log it.

True

Returns:

Type Description
str

The archive lookup key.

Source code in investigraph/archive.py
@anycache(key_func=make_cache_key, store=get_archive_cache())
@error_handler(max_retries=3)
def archive_source(
    uri: Uri,
    *args,
    url_key_only: bool | None = False,
    cache: bool | None = True,
    stealthy: bool | None = False,
    delay: int | None = None,
    raise_on_error: bool | None = True,
    **kwargs,
) -> str:
    """
    Archive a remote file and return the archive key

    The source is opened via `open_virtual` and its content is written to the
    archive under `<key prefix>/<checksum>`, where the key prefix is derived
    from the uri (see `make_archive_key`).

    Args:
        uri: Location of the source file to archive
        url_key_only: Compute cache key just by url as fallback (not read in
            this function body; presumably consumed by the cache key function
            of the caching decorator)
        cache: Set to `False` to disable caching entirely and force a re-fetch
            (not read in this function body; presumably consumed by the cache
            key function of the caching decorator)
        stealthy: Use a random HTTP user agent (for http remote sources). This
            overrides any `User-Agent` given in `headers`.
        delay: Set a delay before fetching (passed to `time.sleep`)
        raise_on_error: Throw exception or just log it.
        **kwargs: Passed on to `open_virtual` as `backend_config`

    Returns:
        The archive lookup key. If an error was logged instead of raised
            before the checksum was known, this is only the key prefix.
    """
    if stealthy:
        # Work on a copy: the caller's headers mapping must not be mutated,
        # and an explicit `headers=None` must not crash the item assignment.
        headers = dict(kwargs.get("headers") or {})
        headers["User-Agent"] = random.choice(AGENTS)
        kwargs["headers"] = headers
    if delay is not None:
        time.sleep(delay)
    log = get_logger(__name__)
    archive = get_archive()
    key = make_archive_key(uri)
    log.info(f"ARCHIVING {uri} ...", archive=archive.uri, prefix=key)
    try:
        with open_virtual(uri, backend_config=kwargs) as fh:
            # The final key is only known once the source has been opened,
            # because it includes the content checksum.
            key = f"{key}/{fh.checksum}"
            with archive.open(key, "wb") as out:
                out.write(fh.read())
    except Exception as e:
        if raise_on_error:
            # Bare raise re-raises the active exception with its traceback.
            raise
        log.error(str(e))
    return str(key)

get_archive(uri=None) cached

Get the archive where to store remote files.

Set the archive via INVESTIGRAPH_ARCHIVE_URI (see Settings)

Parameters:

Name Type Description Default
uri Uri | None

Use this specific uri instead of the global setting.

None

Returns:

Type Description
BaseStore

The archive store (see anystore)

Source code in investigraph/archive.py
@cache
def get_archive(uri: Uri | None = None) -> BaseStore:
    """
    Return the store used for archiving remote files.

    The archive is configured globally via `INVESTIGRAPH_ARCHIVE_URI` (see
    [Settings][investigraph.settings]). The global setting itself is not
    modified: a copy of it is adjusted and turned into a store. Results are
    memoized per `uri` by the `@cache` decorator.

    Args:
        uri: Use this specific uri instead of the global setting.

    Returns:
        The archive store (see
            [anystore](https://docs.investigraph.dev/lib/anystore))
    """
    store_config = settings.archive.model_copy()
    # A falsy `uri` falls back to the uri from the global setting.
    effective_uri = uri if uri else store_config.uri
    store_config.uri = effective_uri
    return store_config.to_store()

make_archive_key(uri)

Make the key prefix based on a file uri.

Example

make_archive_key("https://example.org/files/data.pdf") returns "example.org/files/data.pdf"

Source code in investigraph/archive.py
def make_archive_key(uri: Uri) -> str:
    """
    Build the archive key prefix for a file uri.

    The uri is split with `urlsplit`; every component except the scheme
    (netloc, path, query, fragment) is handed to `join_relpaths`.

    Example:
        >>> make_archive_key("https://example.org/files/data.pdf")
        'example.org/files/data.pdf'
    """
    # Drop the leading scheme component, keep the rest in order.
    _scheme, *components = urlsplit(str(uri))
    return join_relpaths(*components)

open(uri, url_key_only=False, cache=True, stealthy=False, delay=None, raise_on_error=True, mode=None, **kwargs)

Open a file from the archive as a file-like io handler. If it doesn't exist in the archive, it will be stored first.

Parameters:

Name Type Description Default
mode str | None

open mode (default rb)

None
url_key_only bool | None

[only if file doesn't exist in archive yet] Compute cache key just by url as fallback

False
cache bool | None

[only if file doesn't exist in archive yet] Whether to use the cache; set to False to disable caching entirely (force re-fetch)

True
stealthy bool | None

[only if file doesn't exist in archive yet] Use a random HTTP user agent (for HTTP remote sources)

False
delay int | None

[only if file doesn't exist in archive yet] Set a delay before fetching

None
raise_on_error bool | None

[only if file doesn't exist in archive yet] Throw exception or just log it.

True

Returns:

Type Description
ContextManager[IO[AnyStr]]

The open file handler

Source code in investigraph/archive.py
def open(
    uri: Uri,
    url_key_only: bool | None = False,
    cache: bool | None = True,
    stealthy: bool | None = False,
    delay: int | None = None,
    raise_on_error: bool | None = True,
    mode: str | None = None,
    **kwargs,
) -> ContextManager[IO[AnyStr]]:
    """
    Open an archived file as a file-like io handler.

    The uri is first passed through `archive_source` to obtain its archive
    key; the file is then opened from the archive under that key. All
    arguments except `mode` are forwarded to `archive_source`.

    Args:
        uri: Location of the source file
        mode: open mode (default `rb`)
        url_key_only: [only if file doesn't exist in archive yet] Compute cache
            key just by url as fallback
        cache: [only if file doesn't exist in archive yet] Set to `False` to
            disable caching entirely (force re-fetch)
        stealthy: [only if file doesn't exist in archive yet] Use a random HTTP
            user agent (for http remote sources)
        delay: [only if file doesn't exist in archive yet] Set a delay before
            fetching
        raise_on_error: [only if file doesn't exist in archive yet] Throw
            exception or just log it.
        **kwargs: Forwarded to `archive_source`

    Returns:
        The open file handler
    """
    # Keyword order is kept as in the explicit call it replaces.
    fetch_options = {
        "cache": cache,
        "stealthy": stealthy,
        "delay": delay,
        "raise_on_error": raise_on_error,
        "url_key_only": url_key_only,
    }
    archive_key = archive_source(uri, **fetch_options, **kwargs)
    store = get_archive()
    return store.open(archive_key, mode=mode)