Skip to content

Documentation for ToMeDa

tomeda.t12_create_dataverse_compatible_json

keeping_track_dict module-attribute

keeping_track_dict: dict[str, int] = {}

logger module-attribute

logger: TraceLogger = getLogger(__name__)

sibling_hashmap module-attribute

sibling_hashmap = {}

Child dataclass

Child(value: list[str], path: list[str])

path instance-attribute

path: list[str]

value instance-attribute

value: list[str]

NoMatchError

NoMatchError(message='No match found')

Bases: Exception

Source code in tomeda/t12_create_dataverse_compatible_json.py
85
86
def __init__(self, message="No match found"):
    """Initialize the exception, storing the human-readable message on the base Exception."""
    super().__init__(message)

TSVElementInfo dataclass

TSVElementInfo(
    typeName: str,
    multiple: bool,
    typeClass: Literal[
        "primitive", "compound", "controlledVocabulary"
    ],
    schema: str,
    parent: str | None,
)

multiple instance-attribute

multiple: bool

parent instance-attribute

parent: str | None

schema instance-attribute

schema: str

typeClass instance-attribute

typeClass: Literal[
    "primitive", "compound", "controlledVocabulary"
]

typeName instance-attribute

typeName: str

TSVElements dataclass

TSVElements(elements: dict[str, TSVElementInfo])

Contains the 'TSV_Element_Info' entries as a dict, keyed by 'typeName'. This is done to ensure that the keys are unique and to enable faster access.

elements instance-attribute

elements: dict[str, TSVElementInfo]

assemble_based_on_list_view

assemble_based_on_list_view(
    tsv_data: TSVElements,
    metadata: dict,
    mapping: dict[str, str],
) -> dict[str, Any]

Assembles the metadata based on the list view of the TSV files.

Source code in tomeda/t12_create_dataverse_compatible_json.py
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
def assemble_based_on_list_view(
    tsv_data: TSVElements,
    metadata: dict,
    mapping: dict[str, str],
) -> dict[str, Any]:
    """
    Assembles the metadata based on the list view of the TSV files.

    Parameters:
    - tsv_data: All TSV schema elements, keyed by typeName.
    - metadata: The dataset metadata to pull values from.
    - mapping: Maps dataset keys to dataverse keys.

    Returns:
    - dict[str, Any]: A dataverse-compatible dataset envelope as a plain
      dict (produced via dataclasses.asdict).
    """
    schema_blocks = {}

    # metadata_flattened = flatten_dict(metadata)
    ignore_list = []
    # The mapping comes as dataset-key -> dataverse-key; this code needs the
    # reverse direction (dataverse-key -> dataset-key).
    mapping_reverse = {v: k for k, v in mapping.items()}

    mapping_reverse_fixed = fix_mapping(mapping_reverse)

    for dv_key, dv_info in tsv_data.elements.items():
        """
        Iterate over the tsv_data (list view).
        This is the data that has to be populated
        """

        # Get the mapped dataset key from the dataverse key
        dataset_key = mapping_reverse_fixed.get(dv_key)
        # TSV_keys that were not mapped and already read entries shall be
        # ignored.
        # (Translated from German:) There is the problem that elements of
        # which only the children are defined are only visited once as a
        # primitive object, but not as a compound object. I.e. it must be
        # guaranteed that every child has a parent.
        # Solution: add the parent keys to mapping_reverse.
        # Do values then have to be excluded because of potentially
        # duplicated reads?
        if not dataset_key or dv_key in ignore_list:
            continue

        metadata_block_entry = create_metadata_block_entry(
            dv_info,
            dataset_key,
            metadata,
            tsv_data,
            ignore_list,
            mapping_reverse_fixed,
        )
        if metadata_block_entry is None:
            """
            No Metadata for the current element Dataverse Element was found.
            """
            continue

        # Group the produced entries by their metadata block (schema name).
        metadata_block_id: str = dv_info.schema
        schema_blocks.setdefault(metadata_block_id, []).append(
            metadata_block_entry
        )

    # Wrap each schema block into a MetadataSource container.
    metadata_source_list = []
    for metadata_block_id, elements in schema_blocks.items():
        metadata_source = MetadataSource(
            display_name=metadata_block_id,
            fields=elements,
        )
        metadata_source_list.append(metadata_source)

    metadata_source_ = MetadataSource_(metadata_source_list)
    dataset = Dataset(metadata_blocks=metadata_source_, license="")
    dataset_envelope: DatasetEnvelope = DatasetEnvelope(dataset_version=dataset)

    return dataclasses.asdict(dataset_envelope)

compare_strings

compare_strings(
    strings_list: list[str],
) -> list[tuple[int, list[str]]]

Compares a list of strings and identifies the positions where the strings differ.

For each position in the strings where at least one string differs from the others, this function returns that position along with the characters from all strings at that position.

Parameters: - strings_list (list[str]): A list of strings to be compared.

Returns: - list[tuple[int, list[str]]]: A list of tuples where each tuple contains: 1. An integer representing the position in the string where a difference was found. 2. A list of characters (strings of length 1) from all input strings at that position.

Notes: - The function compares strings up to the length of the shortest string in the input list. - If the input list contains less than 2 strings, the function returns a default value of [(0, ["0"])].

Example:

compare_strings(["apple", "aplpe", "appme"]) [(2, ['p', 'l', 'p']), (3, ['l', 'p', 'm'])]

compare_strings(["apple"]) [(0, ['0'])]

Source code in tomeda/t12_create_dataverse_compatible_json.py
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def compare_strings(strings_list: list[str]) -> list[tuple[int, list[str]]]:
    """
    Compares a list of strings and identifies the positions where the strings differ.

    For each position (up to the length of the shortest string) where at
    least one string differs from the others, the position is returned
    together with the characters of all strings at that position.

    Parameters:
    - strings_list (list[str]): A list of strings to be compared.

    Returns:
    - list[tuple[int, list[str]]]: Tuples of (position, characters of all
      input strings at that position).

    Notes:
    - Strings are only compared up to the length of the shortest string.
    - With fewer than 2 input strings the sentinel [(0, ["0"])] is returned.

    Example:
    >>> compare_strings(["apple", "aplpe", "appme"])
    [(2, ['p', 'l', 'p']), (3, ['l', 'p', 'm'])]

    >>> compare_strings(["apple"])
    [(0, ['0'])]
    """
    # Nothing meaningful to compare: keep the historical sentinel result.
    if len(strings_list) < 2:
        return [(0, ["0"])]

    # zip(*...) stops at the shortest string, which matches the documented
    # truncation behavior of the original index-based loop.
    return [
        (position, list(chars))
        for position, chars in enumerate(zip(*strings_list))
        if len(set(chars)) > 1
    ]

create_children

create_children(
    dataset_key_: str, metadata, sibling: TSVElementInfo
) -> list[MetadataElement]

The goal of this function is to create a list of 'values' (level 1 Metadata Elements) for a level 0 Metadata Element.

This path is chosen if the level 0 Metadata Element allows multiple values, and the child created here also allows multiple values.

The problem is that if we only get one element, but multiple are defined in the metadata file, we only get the first piece of data.

So 'multiple' is two-fold. - The level 0 element has multiple children. - The level 1 child has multiple values.

However, due to the flattening, a metadata element can occur multiple times, and depending on the position of this element in the final tsv structure we would need to differentiate whether the 'multi-match' leads to multiple children

or multiple values within each child.

Q: To what metadata key will it be mapped? A: parent_name:child.typeName

Source code in tomeda/t12_create_dataverse_compatible_json.py
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
def create_children(
    dataset_key_: str, metadata, sibling: TSVElementInfo
) -> list[MetadataElement]:
    """
    The goal of this function is to create a list of 'values' (level 1
    Metadata Elements) for a level 0 Metadata Element.

    This path is chosen if the level 0 Metadata Element allows multiple
    values, and the child created here also allows multiple values.

    The problem is that if we only get one element, but multiple are defined
    in the metadata file, we only get the first piece of data.

    So 'multiple' is two-fold.
    - The level 0 element has multiple children.
    - The level 1 child has multiple values.

    However, due to the flattening, a metadata element can occur multiple times,
    and depending on the position of this element in the final tsv structure
    we would need to differentiate whether the 'multi-match' leads to multiple
    children or multiple values within each child.
    ---

    Q: To what metadata key will it be mapped?
    A: parent_name:child.typeName

    Raises:
    - NoMatchError: Propagated from get_value if `dataset_key_` matches
      nothing in `metadata`.
    - ValueError: If the matched paths contain no list index, i.e. the
      multiple-parent assumption of this code path does not hold.
    """

    # NOTE(review): ds, final_name and p are assigned but never used below.
    ds = dataset_key_
    metadata_matches = get_value(dataset_key_, metadata)
    final_name = sibling.parent + ":" + sibling.typeName
    p = metadata_matches.path

    """
    We now have found all values with the correct dataset key.
    However, it is unclear if (possibly) the multiple values are part of the
    multiple parent of part of the multiple childs.

    To solve this, we need to compare the paths of the values.
    We need to find a match of the
    "Dataverse Key --translate--> Dataset Key --extract--> Parent Key"
    We then need to cut off the differences within the parents to get the
    true 'siblings' of the current parent.

    # Thinking area
    parent_name: This it the dataverse parent name, as defined by the TSV schema
    child.typeName: This is the name of the child, as defined by the TSV schema

    dataset_key_: This is the dataset key, as defined by the mapping ( as the stucture of the metadata schema)
    metadata: dict of the metadata in dataset structure (dataset schema structure)

    child_.value = list of values extracted via dataset_key_ (dataset schema structure)
    child_.path = list of paths extracted via dataset_key_ (dataset schema structure)

    ## Parent Deletion / Sibling Extraction
    Based on the information above, the following has to happen:

    I current do not know how to do this.
    As a workaround for now.
    1. Check that the string of the full path has at least two '[' and two ']'. If not, throw error
    2. Assume that the first 'list' is the parent, and the following are concatenated to children.

    #
    """

    if (
        metadata_matches.path[0].count("[") < 1
    ):  # 0th element should always exist
        raise ValueError(
            f"Path {metadata_matches.path[0]} does not contain at least one '[' "
            f"and is therefore not for this code path which assumes multiple parents "
            f"and multiple children."
        )

    def group_children_to_siblings(metadata_matches_: Child) -> list[list[str]]:
        """
        Extracts the siblings from the path.

        Matches are grouped by the list index directly after the first '.'
        in their path, which is assumed to identify the parent.
        """

        # Transform a Child holding multiple values into a list of children
        # with single values, pairing each value with its own path.
        def transform_child_to_single_values(child: Child) -> list[Child]:
            children = []
            for i in range(len(child.value)):
                children.append(Child(child.value[i], child.path[i]))
            return children

        metadata_matches = transform_child_to_single_values(metadata_matches_)

        all_children = []
        parent_ids = []
        """
        extract the ids of the parents
        """
        for match in metadata_matches:
            dataset_parent_idx = match.path.split(".")[1].strip(
                "[]"
            )  # Value after first '.'
            parent_ids.append(dataset_parent_idx)

        for idx in sorted(set(parent_ids)):
            # get the values, where the children have all the same values
            siblings = []
            for match in metadata_matches:
                if idx == match.path.split(".")[1].strip("[]"):
                    siblings.append(match)
            all_children.append(siblings)
            # dataset_parent = match.path.split('.')[0]  # Value befor first '.'
            # dataset_children = match.path.split('.', 2)[-1]  # Value after second '.'

        return all_children

    def choose_correct_siblings(hash_key) -> int:
        """
        Select the index of the sibling group for this call.

        Each call for the same dataset key returns the next index, tracked
        in the module-level sibling_hashmap.
        NOTE(review): the hash_key parameter is immediately overwritten with
        dataset_key_ from the enclosing scope, so the argument is ignored.
        """
        global sibling_hashmap

        hash_key = dataset_key_
        index = sibling_hashmap.setdefault(hash_key, int(0))
        sibling_hashmap[hash_key] += 1

        return index

    siblings = group_children_to_siblings(metadata_matches)

    selected_siblings = []

    selected_siblings = siblings[choose_correct_siblings(dataset_key_)]

    children_elements = (
        []
    )  # These are the children of the current parent as MetadataElements
    for i, sibling_ in enumerate(selected_siblings):
        # ToDo: Extract the correct value.
        # You need to make sure to take the nested multiple into account.
        # Also the hash might not work.

        """
        Problem. The Datasetkey is used for both mutliple parents and multiple children.
        While this is correct for multiple children, it is not correct for multiple parents
        as the dataset key is not unique for the parents.

        Probably need to modulo the results??
        """
        # This is wrong. as is expects a list with multiple values and not a child.
        # I probably need to restructure it again.

        metadata_element = MetadataElement(
            type_name=sibling.typeName,
            type_class=sibling.typeClass,
            multiple=sibling.multiple,
            value=sibling_.value,
        )

        children_elements.append(metadata_element)

    # NOTE(review): assert is stripped under -O; a raise might be safer here.
    assert len(children_elements) > 0
    # children_me.append(children_me_1)

    return children_elements

create_compound_metadata_element

create_compound_metadata_element(
    dv_info: TSVElementInfo,
    metadata,
    tsv_data: TSVElements,
    ignore_list: list[str],
    cmp: list[str],
    mapping_reverse_fixed: dict[str, str],
)

Creates a compound metadata element. This includes the level 0 'compound container' for all children and the children themselves.

Therefore a 'me' (child) as well as a 'me_level0' (the container) is created.

Source code in tomeda/t12_create_dataverse_compatible_json.py
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
def create_compound_metadata_element(
    dv_info: TSVElementInfo,
    metadata,
    tsv_data: TSVElements,
    ignore_list: list[str],
    cmp: list[str],
    mapping_reverse_fixed: dict[str, str],
):
    """
    Creates a compound metadata element.

    This includes the level 0 'compound container' for all children and the
    children themselves; therefore 'me' objects (children) as well as a
    'me_level0' (the container) are created.

    Parameters:
    - dv_info: TSV schema info of the compound (level 0) element.
    - metadata: The dataset metadata to pull values from.
    - tsv_data: All TSV schema elements; scanned for children of dv_info.
    - ignore_list: Mutated in place - the typeNames of children handled
      here are appended so the caller can skip them in its own iteration.
    - cmp: List of primitive typeClass names.
      NOTE(review): currently unused inside this function.
    - mapping_reverse_fixed: Maps dataverse typeNames to dataset keys.

    Returns:
    - MetadataElement | None: The compound element, or None if no child
      produced any metadata.
    """
    children = []

    # Collect the schema entries whose parent is the current compound element.
    for name, content in tsv_data.elements.items():
        if content.parent == dv_info.typeName:
            """
            True, if the parent in the tsv file is the same as the typeName of the current compound element.
            This way, only the children of the current compound element are selected.
            """
            children.append(content)
            ignore_list.append(name)

    children_me = []

    for child in children:
        # Iterate through children ('level 1')

        # Children without a mapping entry cannot be populated.
        if not mapping_reverse_fixed.get(child.typeName, False):
            continue

        dataset_key_: str = mapping_reverse_fixed[child.typeName]

        try:
            if dv_info.multiple:  # level0 allows multiple values
                """
                If the parent (level0) allows multiple values
                """
                if child.multiple:
                    """
                    If the child (level1) allows multiple values,
                    """
                    child_me = create_children(dataset_key_, metadata, child)
                    children_me.append(child_me)
                else:  # child single
                    """
                    If the child (level1) does NOT allow multiple values, but
                    the parent does. We need to loop over the values of the parent
                    and create a child for each value.
                    """

                    child_: Child = get_value(dataset_key_, metadata)
                    for i in range(len(child_.value)):
                        value = extract_correct_value(
                            child_.value, dataset_key_
                        )

                        me = MetadataElement(
                            type_name=child.typeName,
                            type_class=child.typeClass,
                            multiple=child.multiple,
                            value=value,
                        )
                        child_me = [me]
                        children_me.append(child_me)

            else:  # level0 allows only one value
                # child can have multiple or singular value(s)
                """
                If the parent (level0) does NOT multiple values
                If the child allows multiple values is handled in the
                create_child_single_single function.
                """
                child_: Child = get_value(dataset_key_, metadata)
                value = extract_correct_value(child_.value, dataset_key_)
                me = MetadataElement(
                    type_name=child.typeName,
                    type_class=child.typeClass,
                    multiple=child.multiple,
                    value=value,
                )
                child_me = me

                children_me.append(child_me)
        except NoMatchError:
            # No value for this child in the metadata - skip it.
            continue

    # Without any populated child there is no point emitting the container.
    if len(children_me) < 1:
        return None

    metadata_element = MetadataElement(
        type_name=dv_info.typeName,
        type_class=dv_info.typeClass,
        multiple=dv_info.multiple,
        value=children_me,
    )

    return metadata_element

create_metadata_block_entry

create_metadata_block_entry(
    dv_info: TSVElementInfo,
    dataset_key: str,
    metadata,
    tsv_data: TSVElements,
    ignore_list: list[str],
    mapping_reverse_fixed: dict[str, str],
)

Creates a level0 metadata element.

Source code in tomeda/t12_create_dataverse_compatible_json.py
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
def create_metadata_block_entry(
    dv_info: TSVElementInfo,
    dataset_key: str,
    metadata,
    tsv_data: TSVElements,
    ignore_list: list[str],
    mapping_reverse_fixed: dict[str, str],
):
    """
    Create a level0 metadata element for a single dataverse schema entry.

    Dispatches on the element's typeClass: 'primitive' and
    'controlledVocabulary' entries are leaves without children, while
    'compound' entries are containers whose children are resolved too.
    Logs and returns None when no metadata could be found.
    """
    cmp = ["primitive", "controlledVocabulary"]

    if dv_info.typeClass == "compound":
        # Container element: resolve its children as well.
        me_level0 = create_compound_metadata_element(
            dv_info, metadata, tsv_data, ignore_list, cmp, mapping_reverse_fixed
        )
    elif dv_info.typeClass in cmp:
        # Leaf element: no children to resolve.
        me_level0 = create_primitive_metadata_element(
            dv_info, dataset_key, metadata
        )
    else:
        raise ValueError(f"Unknown typeClass: {dv_info.typeClass}")

    if me_level0 is None:
        logger.info(f"No Metadata found for {dv_info.typeName}")

    return me_level0

create_primitive_metadata_element

create_primitive_metadata_element(
    dv_info: TSVElementInfo, dataset_key: str, metadata
)
Source code in tomeda/t12_create_dataverse_compatible_json.py
219
220
221
222
223
224
225
226
227
228
229
230
231
232
def create_primitive_metadata_element(
    dv_info: TSVElementInfo, dataset_key: str, metadata
):
    """
    Build a MetadataElement for a primitive / controlledVocabulary entry.

    Returns None when the dataset key matches nothing in the metadata.
    """
    try:
        matched = get_value(dataset_key, metadata)
    except NoMatchError:
        # Nothing to populate for this element.
        return None

    return MetadataElement(
        type_name=dv_info.typeName,
        type_class=dv_info.typeClass,
        multiple=dv_info.multiple,
        value=matched.value,
    )

extract_correct_value

extract_correct_value(value, hash_key: str) -> str

This function is used to extract the correct value from the list of values. This happens if the level 0 element allows multiple values, but the child only allows one value.

Source code in tomeda/t12_create_dataverse_compatible_json.py
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
def extract_correct_value(value, hash_key: str) -> str:
    """
    Extract the correct value from the list of values.

    This happens if the level 0 element allows multiple values, but the
    child only allows one value: each call for the same `hash_key` returns
    the next element of `value`, with the cursor tracked in the
    module-level `keeping_track_dict`.

    Raises:
        IndexError: If called more often for `hash_key` than there are
            entries in `value`.
    """
    global keeping_track_dict

    # Idiom fix: plain literal 0 instead of int(0); dead commented-out
    # debug code removed.
    index = keeping_track_dict.setdefault(hash_key, 0)
    return_val = value[index]

    keeping_track_dict[hash_key] += 1
    return return_val

extract_data

extract_data(
    tsv_file: Path, instructions: dict = None
) -> dict[str, dict[str, str]]

Extracts keys from a given TSV file.

Args: tsv_file (Path): The input TSV file path. instructions (dict, optional): A dictionary of keys to extract. Defaults to None.

Returns: list: A list of keys. If the key is not specified, the entry 'name' is used.

Source code in tomeda/t12_create_dataverse_compatible_json.py
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
def extract_data(
    tsv_file: Path, instructions: dict = None
) -> dict[str, dict[str, str]]:
    """
    Extracts keys from a given TSV file.

    Args:
        tsv_file (Path): The input TSV file path.
        instructions (dict, optional): Maps output keys to either None
            (copy the column with the same name), a column name (str), or
            a callable ``f(row, controlled_vocab)``.
            Defaults to ``{"name": None}``.

    Returns:
        dict: One entry per data row, keyed by the value of the first
            instruction key (usually 'name').

    Raises:
        TypeError: If an instruction value is neither None, a str, nor a
            callable.
    """
    if instructions is None:
        instructions = {"name": None}

    tsv_data = {}

    controlled_vocab = read_controlled_vocab(tsv_file)

    max_attempts = 5
    attempts = 0
    # There was a weird timing issue when debugging the code.
    # Might be related to opening the file beforehand to read the
    # controlled vocabulary.
    # NOTE(review): the retry loop is currently vestigial - the body always
    # breaks after one pass and `attempts` is never incremented, so the
    # final max_attempts check can never fire.
    while attempts < max_attempts:
        file_handler = TomedaFileHandler(tsv_file)

        lines = file_handler.read(strip=False)
        if not lines:
            logger.error(f"File {tsv_file} is empty.")
            return {}
        # Skip the two header lines that precede the column header.
        lines = lines[2:]

        reader = csv.DictReader(lines, delimiter="\t")
        reader.fieldnames = list(map(str.strip, reader.fieldnames))
        for row in reader:
            first_column_value = row["#datasetField"]

            # The data section ends where the controlled vocabulary begins.
            if first_column_value.startswith("#controlledVocabulary"):
                break
            row_data = {}
            for key, value in instructions.items():
                """
                Execute the instructions for each row.
                """
                if value is None:
                    data_ = row.get(key)
                elif isinstance(value, str):
                    data_ = row.get(value)
                elif isinstance(value, Callable):
                    data_ = value(row, controlled_vocab)
                else:
                    # Was a bare `raise` (which yields "RuntimeError: No
                    # active exception to re-raise"); raise a meaningful
                    # error instead.
                    raise TypeError(
                        f"Unsupported instruction value for key {key!r}: "
                        f"{value!r}"
                    )
                row_data[key] = data_

            tsv_data_key = list(instructions.keys())[
                0
            ]  # ToDo: should this be just the 'name' ?
            tsv_data.update({row_data[tsv_data_key]: row_data})

        break

    if attempts == max_attempts:
        raise Exception("Failed to open the file after multiple attempts")
    return tsv_data

find_change_idx

find_change_idx(input_list: list) -> list[int]
Source code in tomeda/t12_create_dataverse_compatible_json.py
195
196
197
198
199
200
def find_change_idx(input_list: list) -> list[int]:
    """Return the indices at which an element differs from its predecessor."""
    return [
        idx + 1
        for idx, (previous, current) in enumerate(
            zip(input_list, input_list[1:])
        )
        if previous != current
    ]

fix_mapping

fix_mapping(input_: dict[str, str]) -> dict[str, str]
Source code in tomeda/t12_create_dataverse_compatible_json.py
210
211
212
213
214
215
216
def fix_mapping(input_: dict[str, str]) -> dict[str, str]:
    """
    Add, for every key of `input_`, its prefix up to the first capital
    letter, mapped to the first dot-separated segment of the value -
    unless that prefix already maps to a truthy value in `input_`.

    Returns the augmented mapping sorted by key.
    """
    augmented = copy.deepcopy(input_)
    for full_key, mapped_value in input_.items():
        prefix = get_string_until_capital(full_key)
        if not input_.get(prefix):
            augmented[prefix] = mapped_value.split(".")[0]
    return dict(sorted(augmented.items()))

get_multiple

get_multiple(row: dict, _=None) -> bool
Source code in tomeda/t12_create_dataverse_compatible_json.py
790
791
792
793
794
795
796
797
798
799
def get_multiple(row: dict, _=None) -> bool:
    """
    Parse the boolean 'allowmultiples' TSV column of a row.

    Raises ValueError for anything other than (case-insensitive)
    "TRUE" or "FALSE".
    """
    flag = row["allowmultiples"].upper()
    if flag == "TRUE":
        return True
    if flag == "FALSE":
        return False
    raise ValueError(
        f'`allowmultiples` has to be either "TRUE" or "FALSE" but is '
        f'{row["allowmultiples"]}'
    )

get_string_until_capital

get_string_until_capital(string: str) -> str
Source code in tomeda/t12_create_dataverse_compatible_json.py
203
204
205
206
207
def get_string_until_capital(string: str) -> str:
    """
    Return the prefix of `string` up to (excluding) the first uppercase
    character, or the whole string if no uppercase character occurs.
    """
    return next(
        (string[:idx] for idx, char in enumerate(string) if char.isupper()),
        string,
    )

get_typeclass

get_typeclass(row: dict, controlled_vocab=None) -> str
Source code in tomeda/t12_create_dataverse_compatible_json.py
781
782
783
784
785
786
787
def get_typeclass(row: dict, controlled_vocab=None) -> str:
    """
    Derive the dataverse typeClass of a TSV row: 'controlledVocabulary' if
    the row's name appears in the controlled vocabulary, 'compound' for an
    empty or 'none' fieldType, and 'primitive' otherwise.
    """
    if is_controlled_vocab(row, controlled_vocab):
        return "controlledVocabulary"
    return "compound" if row["fieldType"] in ("", "none") else "primitive"

get_value

get_value(dataset_key: str, metadata: dict) -> Child

Retrieves a value or values from the provided metadata using the specified dataset key.

Parameters: - dataset_key (str): The key path (dot-separated) to extract data from the metadata. - metadata (dict): The JSON-like dictionary from which data is extracted. - multiple (bool): If True, fetches all matching values. If False, only fetches the first match. - with_path (bool, optional): If True, also returns the full JSONPath of the value(s). Defaults to False.

Returns: - str: If multiple is False and with_path is False, returns the value as a string. - list[str]: If multiple is True and with_path is False, returns a list of matching values. - tuple[str, str]: If multiple is False and with_path is True, returns a tuple of the value and its path. - tuple[str, list[str]]: If multiple is True and with_path is True, returns a tuple of the list of values and their paths.

Raises: - NoMatchError: If no matches are found in the metadata for the provided dataset key.

Example:

data = {"a": {"b": [{"c": "value1"}, {"c": "value2"}]}} get_value("a.b.c", data, multiple=True) (['value1', 'value2'], ['$["a"]["b"][0]["c"]', '$["a"]["b"][1]["c"]'])

Notes: This function utilizes JSONPath to perform the data extraction from the provided metadata.

Source code in tomeda/t12_create_dataverse_compatible_json.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def get_value(
    dataset_key: str,
    metadata: dict,
) -> Child:
    """
    Retrieves all values from the provided metadata using the specified dataset key.

    Parameters:
    - dataset_key (str): The key path (dot-separated) to extract data from the metadata.
    - metadata (dict): The JSON-like dictionary from which data is extracted.

    Returns:
    - Child: All matching values together with their full JSONPaths.

    Raises:
    - NoMatchError: If no matches are found in the metadata for the provided dataset key.

    Example:
    >> data = {"a": {"b": [{"c": "value1"}, {"c": "value2"}]}}
    >> get_value("a.b.c", data)
    Child(value=['value1', 'value2'], path=['$["a"]["b"][0]["c"]', '$["a"]["b"][1]["c"]'])

    Notes:
    This function utilizes JSONPath to perform the data extraction from the provided metadata.
    """

    # Turn "a.b.c" into "a[*].b[*].c[*]" so every intermediate list level is
    # expanded; the trailing "[*]" on the leaf is stripped again below.
    dataset_key_ = ".".join([key + "[*]" for key in dataset_key.split(".")])

    jsonpath_expr = parse(remove_suffix(dataset_key_, "[*]"))
    matches = jsonpath_expr.find(metadata)

    if len(matches) < 1:
        raise NoMatchError

    # value = [str(match.value) for match in matches]
    # path = [str(match.full_path) for match in matches]
    value = [match.value for match in matches]
    path = [str(match.full_path) for match in matches]

    return Child(value, path)

is_controlled_vocab

is_controlled_vocab(
    row: dict, controlled_vocab: list[dict[str, str]]
) -> bool

The metadata validation ensures that the controlled vocabulary is a subset of the allowed values. It does not have to be checked here.

Source code in tomeda/t12_create_dataverse_compatible_json.py
765
766
767
768
769
770
771
772
773
774
775
776
777
778
def is_controlled_vocab(
    row: dict, controlled_vocab: list[dict[str, str]]
) -> bool:
    """
    Return True if the TSV row's field name uses a controlled vocabulary.

    The metadata validation ensures that the controlled vocabulary is a
    subset of the allowed values; that does not need to be checked here.
    """
    # Deduplicate field names once so the membership test is O(1).
    controlled_vocab_unique = {
        vocab["DatasetField"] for vocab in controlled_vocab
    }
    return row["name"] in controlled_vocab_unique

main

main(param: TomedaParameter) -> None
Source code in tomeda/t12_create_dataverse_compatible_json.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def main(param: TomedaParameter) -> None:
    """
    Create a Dataverse-compatible JSON file from dataset metadata.

    Reads the mapping table and the dataset metadata (both resolved with a
    '.json' suffix), loads the TSV block definitions from ``param.tsv_dir``,
    assembles the dataset structure and writes the result via
    ``write_mapped_dataset``.
    """
    logger.info("Starting to create dataverse compatible json file.")
    mapping_table = param.mapping_table.with_suffix(".json")
    mapping: dict[str, str] = read_mapping_information(mapping_table)

    # contains the values for name, typeClass, multiple. Values are not
    # included.
    # These can be derived from the tsv files.
    tsv_folders: list[Path] = param.tsv_dir
    tsv_data: TSVElements = read_tsv_data(tsv_folders)

    dataset_metadata_file = param.dataset_metadata.with_suffix(".json")

    # Use lazy %-style logging args so formatting only happens when the
    # record is actually emitted.
    logger.info("Reading dataset metadata from '%s'...", dataset_metadata_file)

    dataset_metadata = read_dataset_metadata(dataset_metadata_file)

    # NOTE(review): the messages below say "TSV file", but the output written
    # by write_mapped_dataset is JSON — presumably leftover wording; confirm.
    logger.info("Creating Dataset TSV file...")

    dataset: dict = assemble_based_on_list_view(
        tsv_data, dataset_metadata, mapping
    )

    logger.info("Writing TSV file: %s", param.output)
    # output
    write_mapped_dataset(dataset, param.output, param.force_overwrite)
    logger.info("Finished.")

read_controlled_vocab

read_controlled_vocab(tsv_file: Path) -> list[dict]
Source code in tomeda/t12_create_dataverse_compatible_json.py
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
def read_controlled_vocab(tsv_file: Path) -> list[dict]:
    """
    Parse the '#controlledVocabulary' section of a Dataverse TSV block file.

    Returns one dict per vocabulary row. Returns an empty list when the file
    cannot be read or contains no '#controlledVocabulary' section.
    """
    logger.info("Reading controlled vocabulary from TSV file '%s'...", tsv_file)
    file_handler = TomedaFileHandler(tsv_file)
    lines = file_handler.read(strip=False)
    if not lines:
        logger.error("Failed to read the file %s", tsv_file)
        return []

    # Find the section header. The previous unguarded while-loop raised
    # IndexError when the marker was missing; return [] instead.
    start = next(
        (
            i
            for i, line in enumerate(lines)
            if line.startswith("#controlledVocabulary")
        ),
        None,
    )
    if start is None:
        logger.error(
            "No '#controlledVocabulary' section found in %s", tsv_file
        )
        return []

    reader = csv.DictReader(lines[start:], delimiter="\t")
    # remove whitespace from fieldnames
    reader.fieldnames = [name.strip() for name in reader.fieldnames]

    return list(reader)

read_dataset_metadata

read_dataset_metadata(path: Path) -> dict
Source code in tomeda/t12_create_dataverse_compatible_json.py
826
827
828
def read_dataset_metadata(path: Path) -> dict:
    """Load and decode the dataset-metadata JSON file at *path*."""
    raw = TomedaFileHandler(path).read(raw=True)
    # The raw read yields the whole file content as the first element.
    return json.loads(raw[0])

read_mapping_information

read_mapping_information(path: Path) -> dict
Source code in tomeda/t12_create_dataverse_compatible_json.py
831
832
833
def read_mapping_information(path: Path) -> dict:
    """Load and decode the mapping-table JSON file at *path*."""
    raw = TomedaFileHandler(path).read(raw=True)
    # The raw read yields the whole file content as the first element.
    return json.loads(raw[0])

read_tsv_data

read_tsv_data(tsv_folders: list[Path]) -> TSVElements
Source code in tomeda/t12_create_dataverse_compatible_json.py
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
def read_tsv_data(tsv_folders: list[Path]) -> TSVElements:
    """
    Collect TSV element definitions from every ``*.tsv`` file in the folders.

    The per-field instruction table is handed to ``extract_data``; the merged
    result is sorted by its typeName key and wrapped in ``TSVElements``.
    """
    instructions = {
        "typeName": "name",  # this is used to generate a key for the dict
        "multiple": get_multiple,
        "typeClass": get_typeclass,
        "schema": "metadatablock_id",
        "parent": None,
    }

    collected: dict[str, dict[str, str]] = {}
    for folder in tsv_folders:
        for tsv_file in folder.glob("*.tsv"):
            # ToDo: possible overwriting of keys?
            collected.update(
                extract_data(tsv_file=tsv_file, instructions=instructions)
            )

    # Sort keys (needed for later lookup); dict preserves insertion order,
    # so the resulting container keeps the sorted ordering.
    elements = {
        key: TSVElementInfo(**value)
        for key, value in sorted(collected.items())
    }
    return TSVElements(elements)

remove_suffix

remove_suffix(s, suffix)
Source code in tomeda/t12_create_dataverse_compatible_json.py
89
90
91
92
def remove_suffix(s: str, suffix: str) -> str:
    """
    Return *s* with a trailing *suffix* removed, or *s* unchanged if absent.

    Fixes an empty-suffix bug in the old hand-rolled slice: for
    ``suffix == ""`` the expression ``s[:-len(suffix)]`` evaluated to
    ``s[:0]`` and wrongly returned the empty string. ``str.removesuffix``
    (Python 3.9+) handles that case correctly.
    """
    return s.removesuffix(suffix)

write_mapped_dataset

write_mapped_dataset(
    mapped_dataset, path: Path, overwrite: bool = False
)
Source code in tomeda/t12_create_dataverse_compatible_json.py
652
653
654
655
656
657
658
659
660
661
662
663
def write_mapped_dataset(mapped_dataset, path: Path, overwrite: bool = False):
    """
    Serialize *mapped_dataset* as pretty-printed JSON.

    Note: the filename component of *path* is discarded — the output is
    always written as 'metadata_for_upload.json' inside ``path.parent``.

    Raises:
        TomedaUnexpectedTypeError: if *mapped_dataset* is not a dict.
        TomedaNotAPathError: if *path* is not a ``pathlib.Path``.
    """
    if not isinstance(mapped_dataset, dict):
        # Plain string literal: the old f-prefix had no placeholders.
        raise TomedaUnexpectedTypeError(
            mapped_dataset, "mapped_dataset is not a dict."
        )

    if not isinstance(path, Path):
        raise TomedaNotAPathError(str(path), "path is not a Path object.")

    new_path = path.parent / "metadata_for_upload.json"
    file_handler = TomedaFileHandler(new_path, overwrite=overwrite)
    file_handler.write(json.dumps(mapped_dataset, indent=2))