Source code for encord.orm.dataset

#
# Copyright (c) 2020 Cord Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import dataclasses
import json
from collections import OrderedDict
from datetime import datetime
from enum import Enum, IntEnum
from typing import Dict, List, Optional

from dateutil import parser

from encord.constants.enums import DataType
from encord.orm import base_orm
from encord.orm.formatter import Formatter

DATETIME_STRING_FORMAT = "%Y-%m-%d %H:%M:%S"


class DatasetUserRole(IntEnum):
    """Access level a user can hold on a dataset."""

    ADMIN = 0
    USER = 1
class DataRow(dict, Formatter):
    def __init__(self, uid: str, title: str, data_type: DataType, created_at: datetime):
        """
        Build a data row with dict-style accessors kept for backwards
        compatibility.

        New client code should prefer the property accessors and setters over
        the underlying dictionary; mixing the `dict` style member functions
        with the property accessors and setters is discouraged.

        WARNING: Do NOT use the `.data` member of this class. Its usage could
        corrupt the correctness of the datastructure.
        """
        initial_state = {
            "data_hash": uid,
            "data_title": title,
            "data_type": data_type.to_upper_case_string(),
            "created_at": created_at.strftime(DATETIME_STRING_FORMAT),
        }
        super().__init__(initial_state)

    @property
    def uid(self) -> str:
        return self["data_hash"]

    @uid.setter
    def uid(self, value: str) -> None:
        self["data_hash"] = value

    @property
    def title(self) -> str:
        return self["data_title"]

    @title.setter
    def title(self, value: str) -> None:
        self["data_title"] = value

    @property
    def data_type(self) -> DataType:
        return DataType.from_upper_case_string(self["data_type"])

    @data_type.setter
    def data_type(self, value: DataType) -> None:
        self["data_type"] = value.to_upper_case_string()

    @property
    def created_at(self) -> datetime:
        return parser.parse(self["created_at"])

    @created_at.setter
    def created_at(self, value: datetime) -> None:
        # Stored with seconds resolution only: milliseconds are trimmed for
        # backwards compatibility.
        self["created_at"] = value.strftime(DATETIME_STRING_FORMAT)

    @classmethod
    def from_dict(cls, json_dict: Dict) -> DataRow:
        # The API server currently returns upper cased DataType strings.
        return DataRow(
            uid=json_dict["data_hash"],
            title=json_dict["data_title"],
            data_type=DataType.from_upper_case_string(json_dict["data_type"]),
            created_at=parser.parse(json_dict["created_at"]),
        )

    @classmethod
    def from_dict_list(cls, json_list: List) -> List[DataRow]:
        return [cls.from_dict(element) for element in json_list]
@dataclasses.dataclass(frozen=True)
class DatasetInfo:
    """Metadata describing a dataset in the context of listing datasets."""

    dataset_hash: str
    user_hash: str
    title: str
    description: str
    type: int
    created_at: datetime
    last_edited_at: datetime
class Dataset(dict, Formatter):
    def __init__(
        self,
        title: str,
        storage_location: str,
        data_rows: List[DataRow],
        dataset_hash: str,
        description: Optional[str] = None,
    ):
        """
        DEPRECATED - prefer using the :class:`encord.dataset.Dataset` class instead.

        Dict-style accessors are kept for backwards compatibility. New client
        code should prefer the property accessors and setters over the
        underlying dictionary; mixing the `dict` style member functions with
        the property accessors and setters is discouraged.

        WARNING: Do NOT use the `.data` member of this class. Its usage could
        corrupt the correctness of the datastructure.
        """
        contents = {
            "dataset_hash": dataset_hash,
            "title": title,
            "description": description,
            "dataset_type": storage_location,
            "data_rows": data_rows,
        }
        super().__init__(contents)

    @property
    def dataset_hash(self) -> str:
        # Read-only: the hash identifies the dataset and has no setter.
        return self["dataset_hash"]

    @property
    def title(self) -> str:
        return self["title"]

    @title.setter
    def title(self, value: str) -> None:
        self["title"] = value

    @property
    def description(self) -> str:
        return self["description"]

    @description.setter
    def description(self, value: str) -> None:
        self["description"] = value

    @property
    def storage_location(self) -> StorageLocation:
        # The backing dict stores the server-side string form.
        return StorageLocation.from_str(self["dataset_type"])

    @storage_location.setter
    def storage_location(self, value: StorageLocation) -> None:
        self["dataset_type"] = value.get_str()

    @property
    def data_rows(self) -> List[DataRow]:
        return self["data_rows"]

    @data_rows.setter
    def data_rows(self, value: List[DataRow]) -> None:
        self["data_rows"] = value

    @classmethod
    def from_dict(cls, json_dict: Dict) -> Dataset:
        return Dataset(
            title=json_dict["title"],
            description=json_dict["description"],
            storage_location=json_dict["dataset_type"],
            dataset_hash=json_dict["dataset_hash"],
            data_rows=DataRow.from_dict_list(json_dict.get("data_rows", [])),
        )
@dataclasses.dataclass(frozen=True)
class DatasetDataInfo(Formatter):
    """Hash and display title of a single data item in a dataset."""

    data_hash: str
    title: str

    @classmethod
    def from_dict(cls, json_dict: Dict) -> DatasetDataInfo:
        return DatasetDataInfo(data_hash=json_dict["data_hash"], title=json_dict["title"])
@dataclasses.dataclass(frozen=True)
class AddPrivateDataResponse(Formatter):
    """Response of add_private_data_to_dataset"""

    dataset_data_list: List[DatasetDataInfo]

    @classmethod
    def from_dict(cls, json_dict: Dict) -> AddPrivateDataResponse:
        infos = [DatasetDataInfo.from_dict(mapping) for mapping in json_dict["dataset_data_info"]]
        return AddPrivateDataResponse(infos)
@dataclasses.dataclass(frozen=True)
class DatasetAPIKey(Formatter):
    """An API key granting scoped access to a single dataset."""

    dataset_hash: str
    api_key: str
    title: str
    key_hash: str
    scopes: List[DatasetScope]

    @classmethod
    def from_dict(cls, json_dict: Dict) -> DatasetAPIKey:
        """
        Build a :class:`DatasetAPIKey` from a server response payload.

        ``scopes`` may arrive either as a JSON-encoded string or as an
        already-decoded list; both are accepted. The input dictionary is not
        mutated (the previous implementation wrote the decoded scopes back
        into ``json_dict``, surprising callers that reuse the payload).
        """
        raw_scopes = json_dict["scopes"]
        if isinstance(raw_scopes, str):
            raw_scopes = json.loads(raw_scopes)
        scopes = [DatasetScope(scope) for scope in raw_scopes]
        return DatasetAPIKey(
            json_dict["resource_hash"],
            json_dict["api_key"],
            json_dict["title"],
            json_dict["key_hash"],
            scopes,
        )
class CreateDatasetResponse(dict, Formatter):
    def __init__(
        self,
        title: str,
        storage_location: int,
        dataset_hash: str,
        user_hash: str,
    ):
        """
        Response returned when a new dataset is created.

        Dict-style accessors are kept for backwards compatibility. New client
        code should prefer the property accessors and setters over the
        underlying dictionary; mixing the `dict` style member functions with
        the property accessors and setters is discouraged.

        WARNING: Do NOT use the `.data` member of this class. Its usage could
        corrupt the correctness of the datastructure.
        """
        contents = {
            "title": title,
            "type": storage_location,
            "dataset_hash": dataset_hash,
            "user_hash": user_hash,
        }
        super().__init__(contents)

    @property
    def title(self) -> str:
        return self["title"]

    @title.setter
    def title(self, value: str) -> None:
        self["title"] = value

    @property
    def storage_location(self) -> StorageLocation:
        # Stored as the raw integer enum value under the "type" key.
        return StorageLocation(self["type"])

    @storage_location.setter
    def storage_location(self, value: StorageLocation) -> None:
        self["type"] = value.value

    @property
    def dataset_hash(self) -> str:
        return self["dataset_hash"]

    @dataset_hash.setter
    def dataset_hash(self, value: str) -> None:
        self["dataset_hash"] = value

    @property
    def user_hash(self) -> str:
        return self["user_hash"]

    @user_hash.setter
    def user_hash(self, value: str) -> None:
        self["user_hash"] = value

    @classmethod
    def from_dict(cls, json_dict: Dict) -> CreateDatasetResponse:
        return CreateDatasetResponse(
            title=json_dict["title"],
            storage_location=json_dict["type"],
            dataset_hash=json_dict["dataset_hash"],
            user_hash=json_dict["user_hash"],
        )
class StorageLocation(IntEnum):
    """Where the underlying files of a dataset are stored.

    Note: the first three members were previously declared with trailing-comma
    tuple values (``(0,)`` etc.); they only evaluated to the intended ints
    because IntEnum unpacks tuple values into ``int()``. Declared as plain
    ints now — the member values are unchanged.
    """

    CORD_STORAGE = 0
    AWS = 1
    GCP = 2
    AZURE = 3
    OTC = 4

    @staticmethod
    def from_str(string_location: str) -> StorageLocation:
        """Return the member matching a server-side string identifier.

        Raises KeyError for an unknown identifier.
        """
        return STORAGE_LOCATION_BY_STR[string_location]

    def get_str(self) -> str:
        """Return the server-side string identifier for this location."""
        if self == StorageLocation.CORD_STORAGE:
            return "CORD_STORAGE"
        elif self == StorageLocation.AWS:
            return "AWS_S3"
        elif self == StorageLocation.GCP:
            return "GCP_STR"
        elif self == StorageLocation.AZURE:
            return "AZURE_STR"
        elif self == StorageLocation.OTC:
            return "OTC_STR"
# Reverse lookup from the server-side string identifier (get_str) back to the
# enum member; used by StorageLocation.from_str.
STORAGE_LOCATION_BY_STR: Dict[str, StorageLocation] = {
    storage_location.get_str(): storage_location for storage_location in StorageLocation
}

DatasetType = StorageLocation
"""For backwards compatibility"""
class DatasetScope(Enum):
    """Permission scopes that a dataset API key can carry."""

    READ = "dataset.read"
    WRITE = "dataset.write"
class DatasetData(base_orm.BaseORM):
    """
    Video base ORM.
    """

    # Keyword form keeps the same key order as the previous list-of-tuples
    # declaration (keyword argument order is preserved, PEP 468).
    DB_FIELDS = OrderedDict(
        data_hash=str,
        video=dict,
        images=list,
    )
class SignedVideoURL(base_orm.BaseORM):
    """A signed URL object with supporting information."""

    DB_FIELDS = OrderedDict(
        signed_url=str,
        data_hash=str,
        title=str,
        file_link=str,
    )
class SignedImageURL(base_orm.BaseORM):
    """A signed URL object with supporting information."""

    DB_FIELDS = OrderedDict(
        signed_url=str,
        data_hash=str,
        title=str,
        file_link=str,
    )
class SignedImagesURL(base_orm.BaseListORM):
    """A list of signed image URL objects with supporting information."""

    BASE_ORM_TYPE = SignedImageURL
class SignedDicomURL(base_orm.BaseORM):
    """A signed URL object with supporting information."""

    DB_FIELDS = OrderedDict(
        signed_url=str,
        data_hash=str,
        title=str,
        file_link=str,
    )
class SignedDicomsURL(base_orm.BaseListORM):
    """A list of signed DICOM URL objects with supporting information."""

    BASE_ORM_TYPE = SignedDicomURL
class Video(base_orm.BaseORM):
    """A video object with supporting information."""

    DB_FIELDS = OrderedDict(
        data_hash=str,
        title=str,
        file_link=str,
    )

    # The data hash identifies the row and is never updated in place.
    NON_UPDATABLE_FIELDS = {"data_hash"}
class ImageGroup(base_orm.BaseORM):
    """An image group object with supporting information."""

    DB_FIELDS = OrderedDict(
        data_hash=str,
        title=str,
        file_link=str,
    )

    # The data hash identifies the row and is never updated in place.
    NON_UPDATABLE_FIELDS = {"data_hash"}
class Image(base_orm.BaseORM):
    """An image object with supporting information."""

    DB_FIELDS = OrderedDict(
        data_hash=str,
        title=str,
        file_link=str,
    )

    # The data hash identifies the row and is never updated in place.
    NON_UPDATABLE_FIELDS = {"data_hash"}
class SingleImage(Image):
    """For native single image upload."""

    # Whether the single-image upload succeeded.
    success: bool
@dataclasses.dataclass(frozen=True)
class Images:
    """Uploading multiple images in a batch mode."""

    success: bool
@dataclasses.dataclass(frozen=True)
class DicomSeries:
    """Hash and display title of an uploaded DICOM series."""

    data_hash: str
    title: str
@dataclasses.dataclass(frozen=True)
class ImageGroupOCR:
    """OCR output extracted from an image group."""

    processed_texts: Dict
@dataclasses.dataclass(frozen=True)
class ReEncodeVideoTaskResult:
    """Outcome for one video processed by a re-encode task."""

    data_hash: str
    # The signed url is only present when using StorageLocation.CORD_STORAGE
    signed_url: Optional[str]
    bucket_path: str
@dataclasses.dataclass(frozen=True)
class ReEncodeVideoTask(Formatter):
    """A re encode video object with supporting information."""

    status: str
    # Fixed annotation: the default is None, so the field must be Optional
    # (it was previously annotated as a plain List with a None default).
    result: Optional[List[ReEncodeVideoTaskResult]] = None

    @classmethod
    def from_dict(cls, json_dict: Dict) -> ReEncodeVideoTask:
        """Build a task from a server payload; ``result`` is absent while the
        task has not produced any output yet."""
        if "result" in json_dict:
            results = [
                ReEncodeVideoTaskResult(
                    result["data_hash"],
                    # signed_url is only present for CORD_STORAGE uploads.
                    result.get("signed_url"),
                    result["bucket_path"],
                )
                for result in json_dict["result"]
            ]
            return ReEncodeVideoTask(json_dict["status"], results)
        return ReEncodeVideoTask(json_dict["status"])