Skip to content

Documentation for ToMeDa

tomeda.t02_upload_to_dataverse

logger module-attribute

logger: TraceLogger = getLogger(__name__)

DatasetExistsWarning

DatasetExistsWarning(value: dict, message: str)

Custom Warning that is raised if the dataset already exists in dataverse

Source code in tomeda/t02_upload_to_dataverse.py
57
58
59
60
61
62
63
64
def __init__(self, value: dict, message: str) -> None:
    """
    Store the search result and message, then print a short report.

    Parameters
    ----------
    value
        Search response. Its ``["data"]["items"]`` entries are iterated
        and each entry's ``global_id`` is printed.
    message
        Text that is printed before the list of entries.
    """
    self.value = value
    self.message = message
    print(self.message)

    # One numbered line per entry found by the search.
    matches = self.value["data"]["items"]
    for position, match in enumerate(matches, start=1):
        global_id = match["global_id"]
        print(f"> Search for: {position:3}. {global_id}")

message instance-attribute

message = message

value instance-attribute

value = value

HTTPResponseError

HTTPResponseError(value: str, message: str)

Bases: Exception

Custom error that is raised if HTTP Return is not 200 ('Ok')

Source code in tomeda/t02_upload_to_dataverse.py
35
36
37
38
def __init__(self, value: str, message: str) -> None:
    """
    Initialise the error with the offending value and a message.

    Parameters
    ----------
    value
        Value describing the failed response, kept on ``self.value``.
    message
        Error text, passed to the base class and kept on ``self.message``.
    """
    super().__init__(message)
    self.message = message
    self.value = value

message instance-attribute

message = message

value instance-attribute

value = value

QueryDictNotValidError

QueryDictNotValidError(
    value: Union[dict[str, str], dict[str, str]],
    message: str,
)

Bases: Exception

Custom error that is raised if the Query dict is not valid

Source code in tomeda/t02_upload_to_dataverse.py
44
45
46
47
48
49
def __init__(
    self, value: Union[dict[str, str], dict[str, str]], message: str
) -> None:
    """
    Initialise the error with the rejected query dict and a message.

    Parameters
    ----------
    value
        The query dict that failed validation, kept on ``self.value``.
    message
        Error text, passed to the base class and kept on ``self.message``.
    """
    super().__init__(message)
    self.message = message
    self.value = value

message instance-attribute

message = message

value instance-attribute

value = value

Uploader

Uploader(
    server_url: str,
    target_dataverse: str,
    input_file: str | list[str],
    query_fields: str | list[str],
    api_token: str,
)

Class with the intent to upload data to dataverse. If data already exists, the data is not uploaded.

Parameters:

Name Type Description Default
server_url str

Server url of the dataverse without the api path and trailing slash.

required
target_dataverse str

Name of the dataverse (also named collection) to upload to dataverse repository.

required
input_file str | list[str]

Path to the file that should be uploaded. File has to be in json with dataverse keys.

required
query_fields str | list[str]

Fields that are used to check if the dataset already exists. Use 'typeName' of the metadataBlocks 'citation' field. No 'compound' fields!

required
api_token str

The api token of the dataverse user.

required
Source code in tomeda/t02_upload_to_dataverse.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def __init__(
    self,
    server_url: str,
    target_dataverse: str,
    input_file: str | list[str],
    query_fields: str | list[str],
    api_token: str,
):
    """
    Store the connection settings and normalise the list-like arguments.

    A single string given for ``input_file`` or ``query_fields`` is
    wrapped in a one-element list, so both attributes are always lists.

    Parameters
    ----------
    server_url
        Server url of the dataverse without the api path and trailing slash.
    target_dataverse
        Name of the dataverse (also named collection) to upload to
        dataverse repository.
    input_file
        Path to the file that should be uploaded. File has to be in
        json with dataverse keys.
    query_fields
        Fields that are used to check if the dataset already exists.
        Use 'typeName' of the metadataBlocks 'citation' field.
        No 'compound' fields!
    api_token
        The api token of the dataverse user.
    """
    # Wrap bare strings so the rest of the class can always iterate.
    if isinstance(query_fields, str):
        query_fields = [query_fields]
    if isinstance(input_file, str):
        input_file = [input_file]

    self.api_token: str = api_token
    self.server_url: str = server_url
    self.collection: str = target_dataverse
    self.query_fields: list[str] = query_fields
    self.input_files: list[str] = input_file

api_token instance-attribute

api_token: str = api_token

collection instance-attribute

collection: str = target_dataverse

input_files instance-attribute

input_files: list[str] = (
    [input_file]
    if isinstance(input_file, str)
    else input_file
)

query_fields instance-attribute

query_fields: list[str] = (
    [query_fields]
    if isinstance(query_fields, str)
    else query_fields
)

server_url instance-attribute

server_url: str = server_url

exist_in_dataverse

exist_in_dataverse(input_file: str) -> bool

Function that checks if the dataset already exists in dataverse

Source code in tomeda/t02_upload_to_dataverse.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
def exist_in_dataverse(self, input_file: str) -> bool:
    """
    Check whether a dataset matching ``input_file`` already exists.

    The values of the configured query fields are read from the upload
    file and sent as ``fq`` filter queries to ``/api/search`` of the
    configured server.

    Parameters
    ----------
    input_file
        Path to the upload file whose query-field values are searched for.

    Returns
    -------
    bool
        ``True`` if the search reports at least one match, ``False`` if it
        reports none or if no query fields are configured.

    Raises
    ------
    HTTPResponseError
        If the ``status`` entry of the JSON response is not ``"OK"``.
    """
    # Imported locally so the block does not rely on a file-level import.
    from urllib.parse import quote_plus

    if not self.query_fields:
        return False

    query_dict = self.get_keywords_from_upload_file(input_file)
    logger.debug("query_dict=%s", query_dict)

    # quote_plus turns spaces into '+' and also escapes characters such as
    # '&', '#' or '+' that would otherwise corrupt the query string.
    filter_queries = [
        f"&fq={quote_plus(k)}:{quote_plus(v)}" for k, v in query_dict.items()
    ]
    query = "".join(
        ["q=*", "&per_page=1000", "&type=dataset", *filter_queries]
    )
    logger.debug("query=%s", query)

    headers = {"X-Dataverse-key": self.api_token}
    # Same timeout as in `upload`, so an unresponsive server cannot block
    # the run indefinitely.
    response = requests.get(
        f"{self.server_url}/api/search?{query}",
        headers=headers,
        timeout=10,
    )
    # Parse the body once and reuse it for the status check and the count.
    response_json = response.json()
    if response_json["status"] != "OK":
        raise HTTPResponseError(
            str(response.status_code), "HTTP Response should be 200."
        )

    total_count = response_json["data"]["total_count"]
    if total_count > 0:
        # Instantiated only for its side effect: the constructor prints the
        # message and the matching entries.
        DatasetExistsWarning(
            response_json,
            f"Entry for '{input_file}' already exists {total_count} times "
            f"in dataverse '{self.collection}'",
        )
        return True
    return False

get_keywords_from_upload_file

get_keywords_from_upload_file(
    upload_file: str,
) -> dict[str, str]

Extracts the keywords from the upload file

Source code in tomeda/t02_upload_to_dataverse.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def get_keywords_from_upload_file(self, upload_file: str) -> dict[str, str]:
    """
    Collect the values of the configured query fields from an upload file.

    Parameters
    ----------
    upload_file
        Path to the JSON upload file.

    Returns
    -------
    dict[str, str]
        Mapping built from ``self.query_fields`` and filled by
        ``_extract_keys_from_dataset_field``.

    Raises
    ------
    QueryDictNotValidError
        If any key or value of the resulting mapping is not a string.
    """
    keywords = dict.fromkeys(self.query_fields)
    raw_content = TomedaFileHandler(Path(upload_file)).read(raw=True)[0]
    dataset = json.loads(raw_content)

    # only valid for citation metadata!
    citation_block = dataset["datasetVersion"]["metadataBlocks"]["citation"]
    for dataset_field in citation_block["fields"]:
        keywords = self._extract_keys_from_dataset_field(
            dataset_field, keywords
        )

    for key, value in keywords.items():
        if isinstance(key, str) and isinstance(value, str):
            continue
        raise QueryDictNotValidError(
            keywords,
            "key or value do not seem to be strings! Be sure "
            "that the comparison key is a primitive value!",
        )
    return keywords

iterate_through_input_files

iterate_through_input_files() -> None
Source code in tomeda/t02_upload_to_dataverse.py
160
161
162
163
164
165
166
167
168
169
def iterate_through_input_files(self) -> None:
    """
    Process every file in ``self.input_files``.

    A file is uploaded via ``upload`` unless ``exist_in_dataverse``
    reports that it already exists; progress is printed to stdout.
    """
    print("Start iterating...")
    for candidate in self.input_files:
        if self.exist_in_dataverse(candidate):
            print("... not uploading...", end="")
            continue
        self.upload(candidate)

    print("...done")

upload

upload(input_file: str) -> None

Function that uploads the given dataset to the dataverse

Source code in tomeda/t02_upload_to_dataverse.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def upload(self, input_file: str) -> None:
    """
    Upload the dataset in ``input_file`` to the target collection.

    The file content is posted as JSON to
    ``/api/dataverses/<collection>/datasets`` of the configured server.

    Parameters
    ----------
    input_file
        Path to the file whose content is sent as the request body.

    Raises
    ------
    HTTPResponseError
        If the ``status`` entry of the JSON response is not ``"OK"``.
    """
    headers = {
        "X-Dataverse-key": self.api_token,
        "Content-Type": "application/json",
    }
    print("Uploading...", end="")
    # The open file handle is passed as the request body; it must stay
    # open until the request has completed.
    with open(input_file, "rb") as data:
        response = requests.post(
            f"{self.server_url}/api/dataverses/{self.collection}/datasets",
            headers=headers,
            data=data,
            timeout=10,
        )
    # Parse the body once and reuse it for logging and the status check.
    response_json = response.json()
    logger.debug("upload: response.json()=%s", response_json)

    if response_json["status"] != "OK":
        raise HTTPResponseError(
            value=str(response_json),
            message="HTTP Response should be 'OK'.",
        )
    print(" success")

main

main(param: TomedaParameter) -> None
Source code in tomeda/t02_upload_to_dataverse.py
20
21
22
23
24
25
26
27
28
def main(param: TomedaParameter) -> None:
    """Build an ``Uploader`` from ``param`` and run it over its input files."""
    settings = {
        "server_url": param.server_url,
        "target_dataverse": param.target_collection,
        # The uploader expects the path as a string.
        "input_file": str(param.upload_file),
        "query_fields": param.query_fields,
        "api_token": param.api_token,
    }
    Uploader(**settings).iterate_through_input_files()