Skip to content

Documentation for ToMeDa

tomeda.t101_derive_dataverse_key

This script is used to create a mapping table from dataset keys to dataverse keys, using both dataset metadata and manually matched metadata.

Functions:

  1. get_args() -> argparse.Namespace: This function parses command-line arguments which include paths to dataset keys, manually matched entries, an administration directory, a metadata information table, and an output file.

  2. create_mapping_table(info_table: dict, manual_matched_metadata: Path | list[Path] = None) -> tuple[dict[str, str], list[str], list[str]]: This function creates the mapping table using the metadata information table and manually matched metadata.

  3. _read_manual_matched_entries(manual_matched_metadata: Path | list[Path] = None) -> dict[str, str]: This function reads manually matched entries from given files.

  4. _create_mapping_table(info_table: dict[str, Any], manual_matched_entries: dict[str, str]) -> mapping_rt_type: This function creates the mapping table based on the information table and manually matched entries.

  5. string_list_to_camelCase(st: list[str]) -> str: This function converts a list of strings to camelCase.

The main driver of this script is the 'main' function which processes command line arguments, creates the mapping table, writes the mapping table to a JSON file, and updates the metadata information table file by adding a new field 'dataverse_name' to each entry.

logger module-attribute

logger: TraceLogger = getLogger(__name__)

mapping_rt_type module-attribute

mapping_rt_type = tuple[
    list[list[str]], list[str], list[str]
]

create_mapping_table

create_mapping_table(
    info_table: dict,
    manual_matched_metadata: Path | list[Path] = None,
) -> tuple[dict[str, str], list[str], list[str]]

Create a mapping table using the dataset metadata and manually matched metadata.

Parameters:

Name Type Description Default
info_table dict

Metadata information table from which the mapping is derived.

required
manual_matched_metadata Path or list[Path]

Path(s) to the manually matched metadata.

None

Returns:

Type Description
tuple

Tuple containing the mapping table, new dataverse keys, and split keys.

Source code in tomeda/t101_derive_dataverse_key.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def create_mapping_table(
    info_table: dict,
    manual_matched_metadata: Path | list[Path] | None = None,
) -> tuple[dict[str, str], list[str], list[str]]:
    """
    Create a mapping table from the info table and manually matched metadata.

    Parameters
    ----------
    info_table : dict
        Metadata information table; forwarded unchanged to
        ``_create_mapping_table``.
    manual_matched_metadata : Path or list[Path], optional
        Path(s) to the manually matched metadata; forwarded unchanged to
        ``_read_manual_matched_entries``. Defaults to ``None``.

    Returns
    -------
    tuple
        ``(mapping, new_dataset_keys, split_keys)``. ``mapping`` is a dict
        built from the first two items of every row produced by
        ``_create_mapping_table``; the other two lists are passed through
        as returned by that helper.
    """
    # {dataset_key: dataverse_key}
    manual_matched_entries: dict[str, str] = _read_manual_matched_entries(
        manual_matched_metadata
    )

    # new_dataset_keys: list[new_dataset_key] (camelCase)
    # split_keys: list[dataset_key] (list of parents)
    mapping_rows, new_dataset_keys, split_keys = _create_mapping_table(
        info_table, manual_matched_entries
    )

    # NOTE(review): the previous comment described each row as
    # [dataverse_key, dataset_key], which contradicts the "ds_to_dv" name of
    # the resulting dict -- confirm the row order in _create_mapping_table.
    # On duplicate first items the last row wins (plain dict semantics).
    mapping_table_ds_to_dv = {row[0]: row[1] for row in mapping_rows}

    return mapping_table_ds_to_dv, new_dataset_keys, split_keys

main

main(param: TomedaParameter) -> None
Source code in tomeda/t101_derive_dataverse_key.py
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def main(param: TomedaParameter) -> None:
    """
    Derive the dataverse keys and persist the resulting mapping.

    Steps, in order:

    1. Read and parse the schema info table.
    2. Build the mapping with ``create_mapping_table``.
    3. Write ``complete_mapping.json`` and ``new_dataset_keys.txt`` into the
       directory of the schema info table.
    4. Add ``"dataverse_name"`` to every schema entry whose ``"name"`` is in
       the mapping, drop all other entries, and write the schema info table
       back.

    Parameters
    ----------
    param : TomedaParameter
        Only ``matched_entries``, ``schema_info_table`` and
        ``force_overwrite`` are read here.
    """
    logger.info("Derive Dataverse Keys.")

    # manually matched metadata -> based on
    # "tomeda_10_extract_dataverse_native_keys.py"
    manually_matched_keys: list[Path] = [param.matched_entries]
    schema_info_table: Path = param.schema_info_table

    # This handle is also used to write the updated table back at the end of
    # the function, so it is opened with overwrite=True regardless of
    # param.force_overwrite.
    schema_file_handle = TomedaFileHandler(schema_info_table, overwrite=True)
    schema_info: dict = nt.loads(schema_file_handle.read(raw=True)[0])

    # mapping_table_ds_to_dv: looked up below by the schema "name"; the value
    #     is stored as "dataverse_name"
    # new_dataset_keys: list[new_dataset_key] (camelCase)
    # the third return value (split keys) is not needed here
    mapping_table_ds_to_dv, new_dataset_keys, _ = create_mapping_table(
        info_table=schema_info, manual_matched_metadata=manually_matched_keys
    )

    mapping_table_file_handle = TomedaFileHandler(
        schema_info_table.parent / Path("complete_mapping.json"),
        overwrite=param.force_overwrite,
    )
    mapping_table_file_handle.write(
        json.dumps(mapping_table_ds_to_dv, indent=4)
    )

    new_dataset_keys_file_handle = TomedaFileHandler(
        schema_info_table.parent / Path("new_dataset_keys.txt"),
        overwrite=param.force_overwrite,
    )
    new_dataset_keys_file_handle.write(
        [key + "\n" for key in new_dataset_keys]
    )

    # Add the properly formatted dataverse name to each entry, keyed by the
    # "name" defined in the schema definition. Unmapped entries are collected
    # first and removed afterwards, because a dict must not change size while
    # it is being iterated.
    unmapped_entries = []
    for entry_key, entry_value in schema_info.items():
        name = entry_value["name"]

        if name in mapping_table_ds_to_dv:
            entry_value["dataverse_name"] = mapping_table_ds_to_dv[name]
        else:
            # This value is ignored. So delete it:
            unmapped_entries.append(entry_key)

    for entry_key in unmapped_entries:
        del schema_info[entry_key]

    schema_file_handle.write(nt.dumps(schema_info))

    logger.info("Derive Dataverse Keys. Done.")

string_list_to_camelCase

string_list_to_camelCase(st: list[str]) -> str

Convert a list of strings to camelCase. Args: st: the list of strings to join.

Returns:

Type Description
str

A camelCase string.

Source code in tomeda/t101_derive_dataverse_key.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
def string_list_to_camelCase(st: list[str]) -> str:
    """
    Convert a list of strings to camelCase.

    Every purely alphanumeric element gets its first character upper-cased
    and is concatenated in order; the first character of the joined result
    is then lower-cased. The remaining characters of each element are kept
    exactly as given. Elements that are empty or contain any
    non-alphanumeric character are skipped.

    Parameters
    ----------
    st : list[str]
        The strings to join.

    Returns
    -------
    str
        A camelCase string, or ``""`` when ``st`` is empty or no element is
        alphanumeric.

    """
    # str.isalnum() is False for "", so word[0] can never fail here.
    joined = "".join(
        word[0].upper() + word[1:] for word in st if word.isalnum()
    )
    # Slice instead of indexing: joined[0] raised IndexError whenever nothing
    # survived the filter above (e.g. for an empty input list).
    return joined[:1].lower() + joined[1:]