# Source code for ocds_babel.extract

"""
Babel extractors can be specified in configuration files.

For OCDS, you can specify in ``babel_ocds_codelist.cfg``::

    [ocds_codelist: schema/*/codelists/*.csv]
    headers = Title,Description,Extension
    ignore = currency.csv

and in ``babel_ocds_schema.cfg``::

    [ocds_schema: schema/*/*-schema.json]

For BODS, you can specify in ``babel_bods_codelist.cfg``::

    [ocds_codelist: schema/codelists/*.csv]
    headers = title,description,technical note

and in ``babel_bods_schema.cfg``::

    [ocds_schema: schema/*.json]

For OC4IDS, you can specify in a Babel ``.cfg`` file::

    [extractors]
    yaml = ocds_babel.extract:extract_yaml
    [yaml: mapping/sustainability.yaml]
    keys = title,disclosure format,mapping
"""

import csv
import json
import os
from io import StringIO

from ocds_babel import TRANSLATABLE_EXTENSION_METADATA_KEYWORDS, TRANSLATABLE_SCHEMA_KEYWORDS
from ocds_babel.util import text_to_translate



[docs]
def extract_codelist(fileobj, keywords, comment_tags, options):
    """
    Yield each header, and the values of the specified fields of a codelist CSV file.

    Non-empty column names are always yielded, with line number 0. Cell values are yielded only if the file's
    basename isn't listed in the ``ignore`` option; whether a cell in a given column is extracted is decided by
    ``text_to_translate``, which is told whether the column is listed in the ``headers`` option.

    :param fileobj: a binary file-like object with a ``name`` attribute; its content is decoded with the default
                    codec of ``bytes.decode()``
    :param keywords: unused
    :param comment_tags: unused
    :param options: the extractor's options (``headers`` and ``ignore``, as comma-separated strings), or ``None``
    :returns: a generator of ``(lineno, "", text, comments)`` tuples, where ``lineno`` is 0 for a header or the
              1-based index of the data row, and ``comments`` is ``""`` for a header or ``[column name]`` for a value
    """
    headers = _get_option_as_list(options, "headers")
    ignore = _get_option_as_list(options, "ignore")

    # Use universal newlines mode (newline=""), to avoid parsing errors: the csv module handles line endings itself.
    reader = csv.DictReader(StringIO(fileobj.read().decode(), newline=""))

    # An empty file has no header row, in which case DictReader.fieldnames is None rather than an empty list.
    for fieldname in reader.fieldnames or ():
        if fieldname:
            yield 0, "", fieldname, ""

    if os.path.basename(fileobj.name) not in ignore:
        for lineno, row in enumerate(reader, 1):
            for key, value in row.items():
                text = text_to_translate(value, key in headers)
                if text:
                    yield lineno, "", text, [key]




[docs]
def extract_schema(fileobj, keywords, comment_tags, options):
    """
    Yield the translatable values of a JSON Schema file.

    The parsed JSON is walked recursively. For each member of each object, the member's descendants are visited
    first, then the member's own value is offered to ``text_to_translate``, along with whether its name is one of
    ``TRANSLATABLE_SCHEMA_KEYWORDS``. Every result is yielded with line number 1, and with the slash-separated
    path to the value as its only comment.

    ``keywords``, ``comment_tags`` and ``options`` are unused.
    """

    def _walk(node, path):
        if isinstance(node, dict):
            for name, child in node.items():
                child_path = f"{path}/{name}"
                # Nested values are yielded before the value that contains them.
                yield from _walk(child, child_path)
                message = text_to_translate(child, name in TRANSLATABLE_SCHEMA_KEYWORDS)
                if message:
                    yield 1, "", message, [child_path]
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from _walk(child, f"{path}/{position}")
        # Scalars have no members to visit: they are only considered as the value of an object member.

    schema = json.loads(fileobj.read().decode())
    yield from _walk(schema, "")




[docs]
def extract_extension_metadata(fileobj, keywords, comment_tags, options):
    """
    Yield the translatable values of an extension.json file.

    For each of ``TRANSLATABLE_EXTENSION_METADATA_KEYWORDS``, the top-level value is read. If it is an object, its
    ``"en"`` member is used, and the comment is ``/<key>/en``; otherwise (the old extension.json format), the value
    itself is used, and the comment is ``/<key>``. Every result is yielded with line number 1.

    ``keywords``, ``comment_tags`` and ``options`` are unused.
    """
    metadata = json.loads(fileobj.read().decode())

    for keyword in TRANSLATABLE_EXTENSION_METADATA_KEYWORDS:
        entry = metadata.get(keyword)

        # An object maps language codes to text; anything else is the old extension.json format.
        has_languages = isinstance(entry, dict)
        pointer = f"/{keyword}/en" if has_languages else f"/{keyword}"
        if has_languages:
            entry = entry.get("en")

        message = text_to_translate(entry)
        if message:
            yield 1, "", message, [pointer]




[docs]
def extract_yaml(fileobj, keywords, comment_tags, options):
    """
    Yield the translatable values of a YAML file, for the keys named in the ``keys`` option.

    The parsed YAML is walked recursively. For each entry of each mapping, the entry's descendants are visited
    first, then the entry's own value is offered to ``text_to_translate``, along with whether its key is listed in
    the ``keys`` option. Every result is yielded with line number 1, and with the slash-separated path to the value
    as its only comment.

    ``keywords`` and ``comment_tags`` are unused.
    """
    # Imported locally, so that the module can be imported without PyYAML -- presumably an optional dependency.
    import yaml  # noqa: PLC0415

    translatable = _get_option_as_list(options, "keys")

    def _walk(node, path):
        if isinstance(node, dict):
            for name, child in node.items():
                child_path = f"{path}/{name}"
                # Nested values are yielded before the value that contains them.
                yield from _walk(child, child_path)
                message = text_to_translate(child, name in translatable)
                if message:
                    yield 1, "", message, [child_path]
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from _walk(child, f"{path}/{position}")

    document = yaml.safe_load(fileobj.read().decode())
    yield from _walk(document, "")



def _get_option_as_list(options, key):
    if options:
        return options.get(key, "").split(",")
    return []