# Source code for ocds_babel.extract
"""
Babel extractors can be specified in configuration files.

For OCDS, you can specify in ``babel_ocds_codelist.cfg``::

    [ocds_codelist: schema/*/codelists/*.csv]
    headers = Title,Description,Extension
    ignore = currency.csv

and in ``babel_ocds_schema.cfg``::

    [ocds_schema: schema/*/*-schema.json]

For BODS, you can specify in ``babel_bods_codelist.cfg``::

    [ocds_codelist: schema/codelists/*.csv]
    headers = title,description,technical note

and in ``babel_bods_schema.cfg``::

    [ocds_schema: schema/*.json]

For OC4IDS, you can specify in a Babel ``.cfg`` file::

    [extractors]
    yaml = ocds_babel.extract:extract_yaml

    [yaml: mapping/sustainability.yaml]
    keys = title,disclosure format,mapping
"""
import csv
import json
import os
from io import StringIO
from ocds_babel import TRANSLATABLE_EXTENSION_METADATA_KEYWORDS, TRANSLATABLE_SCHEMA_KEYWORDS
from ocds_babel.util import text_to_translate
# [docs]
def extract_codelist(fileobj, keywords, comment_tags, options):
    """
    Yield each header, and the values of the specified fields of a codelist CSV file.

    Every non-empty column heading is yielded with line number 0 and an empty comment. Then, unless the
    file's basename is listed in the ``ignore`` option, each cell is passed to ``text_to_translate``
    together with whether its column is listed in the ``headers`` option; truthy results are yielded with
    the 1-based row number and the column name as the comment.

    :param fileobj: a binary file-like object with a ``name`` attribute
    :param keywords: unused (part of the extractor signature)
    :param comment_tags: unused (part of the extractor signature)
    :param options: a mapping (or a falsy value) that may set the comma-separated ``headers`` and ``ignore``
    :returns: a generator of ``(lineno, funcname, message, comments)`` tuples
    """
    headers = _get_option_as_list(options, "headers")
    ignore = _get_option_as_list(options, "ignore")

    # Use universal newlines mode, to avoid parsing errors: newline="" leaves line endings for csv to handle.
    reader = csv.DictReader(StringIO(fileobj.read().decode(), newline=""))

    # ``fieldnames`` is None if the file is empty. Treat that as "no headings" instead of raising TypeError.
    for fieldname in reader.fieldnames or ():
        if fieldname:
            yield 0, "", fieldname, ""

    # The headings of an ignored file are still extracted (above); only its rows are skipped.
    if os.path.basename(fileobj.name) not in ignore:
        for lineno, row in enumerate(reader, 1):
            for key, value in row.items():
                text = text_to_translate(value, key in headers)
                if text:
                    yield lineno, "", text, [key]
# [docs]
def extract_schema(fileobj, keywords, comment_tags, options):
    """
    Yield the values of the translatable keywords of a JSON Schema file.

    The decoded JSON document is walked recursively. Each object member's value is passed to
    ``text_to_translate`` together with whether its key is in ``TRANSLATABLE_SCHEMA_KEYWORDS``; truthy
    results are yielded with line number 1 and the member's slash-separated path as the comment.

    :param fileobj: a binary file-like object containing JSON
    :param keywords: unused (part of the extractor signature)
    :param comment_tags: unused (part of the extractor signature)
    :param options: unused (part of the extractor signature)
    :returns: a generator of ``(lineno, funcname, message, comments)`` tuples
    """

    def _walk(node, path):
        if isinstance(node, dict):
            for name, child in node.items():
                child_path = f"{path}/{name}"
                # Descend first, so that nested messages precede the member's own message.
                yield from _walk(child, child_path)
                message = text_to_translate(child, name in TRANSLATABLE_SCHEMA_KEYWORDS)
                if message:
                    yield 1, "", message, [child_path]
        elif isinstance(node, list):
            for position, element in enumerate(node):
                yield from _walk(element, f"{path}/{position}")

    document = json.loads(fileobj.read().decode())
    yield from _walk(document, "")
# [docs]
def extract_extension_metadata(fileobj, keywords, comment_tags, options):
    """
    Yield the values of the translatable keywords of an extension.json file.

    For each key in ``TRANSLATABLE_EXTENSION_METADATA_KEYWORDS``, the top-level value is looked up. If it
    is a mapping, its ``"en"`` entry is used; otherwise, the value itself is used. Truthy results of
    ``text_to_translate`` are yielded with line number 1 and the value's path as the comment.

    :param fileobj: a binary file-like object containing JSON
    :param keywords: unused (part of the extractor signature)
    :param comment_tags: unused (part of the extractor signature)
    :param options: unused (part of the extractor signature)
    :returns: a generator of ``(lineno, funcname, message, comments)`` tuples
    """
    metadata = json.loads(fileobj.read().decode())

    for keyword in TRANSLATABLE_EXTENSION_METADATA_KEYWORDS:
        entry = metadata.get(keyword)

        # A non-mapping value is the old extension.json format, in which the text is stored directly.
        localized = isinstance(entry, dict)
        if localized:
            entry = entry.get("en")
        pointer = f"/{keyword}/en" if localized else f"/{keyword}"

        message = text_to_translate(entry)
        if message:
            yield 1, "", message, [pointer]
# [docs]
def extract_yaml(fileobj, keywords, comment_tags, options):
    """
    Yield the values of the specified keys of a YAML file.

    The document is loaded with ``yaml.safe_load`` and walked recursively. Each mapping member's value is
    passed to ``text_to_translate`` together with whether its key is listed in the ``keys`` option; truthy
    results are yielded with line number 1 and the member's slash-separated path as the comment.

    :param fileobj: a binary file-like object containing YAML
    :param keywords: unused (part of the extractor signature)
    :param comment_tags: unused (part of the extractor signature)
    :param options: a mapping (or a falsy value) that may set the comma-separated ``keys``
    :returns: a generator of ``(lineno, funcname, message, comments)`` tuples
    """
    # Imported here rather than at module level, so that yaml is needed only when this extractor runs.
    import yaml  # noqa: PLC0415

    wanted = _get_option_as_list(options, "keys")

    def _walk(node, path):
        if isinstance(node, dict):
            for name, child in node.items():
                child_path = f"{path}/{name}"
                # Descend first, so that nested messages precede the member's own message.
                yield from _walk(child, child_path)
                message = text_to_translate(child, name in wanted)
                if message:
                    yield 1, "", message, [child_path]
        elif isinstance(node, list):
            for position, element in enumerate(node):
                yield from _walk(element, f"{path}/{position}")

    document = yaml.safe_load(fileobj.read().decode())
    yield from _walk(document, "")
def _get_option_as_list(options, key):
if options:
return options.get(key, "").split(",")
return []