Source code for encord.dataset

from pathlib import Path
from typing import Dict, Iterable, List, Optional, TextIO, Union

from encord.client import EncordClientDataset
from encord.http.utils import CloudUploadSettings
from encord.orm.cloud_integration import CloudIntegration
from encord.orm.dataset import AddPrivateDataResponse, DataRow
from encord.orm.dataset import Dataset as OrmDataset
from encord.orm.dataset import (
    DatasetAccessSettings,
    DatasetUser,
    DatasetUserRole,
    Image,
    ImageGroupOCR,
    StorageLocation,
)


class Dataset:
    """
    Access dataset related data and manipulate the dataset.
    """

    def __init__(self, client: EncordClientDataset):
        self._client = client
        self._dataset_instance: Optional[OrmDataset] = None

    @property
    def dataset_hash(self) -> str:
        """
        Get the dataset hash (i.e. the Dataset ID).
        """
        dataset_instance = self._get_dataset_instance()
        return dataset_instance.dataset_hash

    @property
    def title(self) -> str:
        dataset_instance = self._get_dataset_instance()
        return dataset_instance.title

    @property
    def description(self) -> str:
        dataset_instance = self._get_dataset_instance()
        return dataset_instance.description

    @property
    def storage_location(self) -> StorageLocation:
        dataset_instance = self._get_dataset_instance()
        return dataset_instance.storage_location

    @property
    def data_rows(self) -> List[DataRow]:
        """
        Part of the response of this property can be configured by the
        :meth:`encord.dataset.Dataset.set_access_settings` method.

        .. code::

            dataset.set_access_settings(DatasetAccessSettings(fetch_client_metadata=True))
            print(dataset.data_rows)
        """
        dataset_instance = self._get_dataset_instance()
        return dataset_instance.data_rows
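    # Usage sketch: a `Dataset` is normally obtained from an `EncordUserClient`
    # rather than constructed directly. The private key variable is a placeholder
    # and the dataset hash is hypothetical:
    #
    #     from encord import EncordUserClient
    #
    #     user_client = EncordUserClient.create_with_ssh_private_key(ssh_private_key)
    #     dataset = user_client.get_dataset("<dataset_hash>")
    #     dataset.set_access_settings(DatasetAccessSettings(fetch_client_metadata=True))
    #     for data_row in dataset.data_rows:
    #         print(data_row.title)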
    def refetch_data(self) -> None:
        """
        The Dataset class fetches its properties only once. Use this function if you
        suspect that the cached state of those properties is stale.
        """
        self._dataset_instance = self.get_dataset()
    def get_dataset(self) -> OrmDataset:
        """
        This function is exposed for convenience. You are encouraged to use the
        property accessors instead.
        """
        return self._client.get_dataset()
    def set_access_settings(
        self, dataset_access_settings: DatasetAccessSettings, *, refetch_data: bool = True
    ) -> None:
        """
        Args:
            dataset_access_settings: The access settings to use going forward.
            refetch_data: Whether a `refetch_data()` call should follow the update of
                the dataset access settings.
        """
        self._client.set_access_settings(dataset_access_settings)
        if refetch_data:
            self.refetch_data()
    def add_users(self, user_emails: List[str], user_role: DatasetUserRole) -> List[DatasetUser]:
        """
        Add users to the dataset. If a user has already been added, this operation
        will succeed but their `user_role` will be unchanged. The existing
        `user_role` will be reflected in the returned `DatasetUser` instance.

        Args:
            user_emails: list of user emails to be added
            user_role: the user role to assign to all users
        """
        return self._client.add_users(user_emails, user_role)
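    # Usage sketch (hypothetical emails; `DatasetUserRole.USER` is assumed to be a
    # member of the imported `DatasetUserRole` enum):
    #
    #     users = dataset.add_users(
    #         ["alice@example.com", "bob@example.com"], DatasetUserRole.USER
    #     )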
    def upload_video(
        self,
        file_path: str,
        cloud_upload_settings: CloudUploadSettings = CloudUploadSettings(),
        title: Optional[str] = None,
    ):
        """
        Upload a video to Encord storage.

        Args:
            file_path: path to the video, e.g. '/home/user/data/video.mp4'
            cloud_upload_settings: Settings for uploading data into the cloud. Change
                this object to overwrite the default values.
            title: The video title. If unspecified, this will be the file name. The
                title should include an extension, for example "encord_video.mp4".

        Returns:
            Bool.

        Raises:
            UploadOperationNotSupportedError: If trying to upload to external
                datasets (e.g. S3/GCP/Azure).
        """
        return self._client.upload_video(file_path, cloud_upload_settings=cloud_upload_settings, title=title)
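    # Usage sketch (hypothetical file path; the `CloudUploadSettings` field used
    # here is an assumption about its constructor):
    #
    #     dataset.upload_video(
    #         "/home/user/data/video.mp4",
    #         cloud_upload_settings=CloudUploadSettings(max_retries=3),
    #         title="encord_video.mp4",
    #     )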
    def create_image_group(
        self,
        file_paths: Iterable[str],
        max_workers: Optional[int] = None,
        cloud_upload_settings: CloudUploadSettings = CloudUploadSettings(),
        title: Optional[str] = None,
        *,
        create_video: bool = True,
    ):
        """
        Create an image group in Encord storage. Choose this type of upload for
        sequential images; otherwise, consider the :meth:`.Dataset.upload_image`
        function.

        Args:
            file_paths: a list of paths to images, e.g.
                ['/home/user/data/img1.png', '/home/user/data/img2.png']
            max_workers: DEPRECATED: This argument will be ignored.
            cloud_upload_settings: Settings for uploading data into the cloud. Change
                this object to overwrite the default values.
            title: The title of the image group. If unspecified, this will be
                randomly generated for you. The title should NOT include an
                extension, for example "encord_image_group".
            create_video: A flag specifying how image groups are stored. If `True`
                (the previous default behaviour), a compressed video is created from
                the image group. If `False`, the images are saved as a sequence of
                images.

        Returns:
            Bool.

        Raises:
            UploadOperationNotSupportedError: If trying to upload to external
                datasets (e.g. S3/GCP/Azure).
        """
        return self._client.create_image_group(
            file_paths,
            cloud_upload_settings=cloud_upload_settings,
            title=title,
            create_video=create_video,
        )
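    # Usage sketch (hypothetical paths); `create_video=False` stores the frames as
    # an image sequence rather than a compressed video:
    #
    #     dataset.create_image_group(
    #         ["/home/user/data/img1.png", "/home/user/data/img2.png"],
    #         title="encord_image_group",
    #         create_video=False,
    #     )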
    def create_dicom_series(
        self,
        file_paths: List[str],
        cloud_upload_settings: CloudUploadSettings = CloudUploadSettings(),
        title: Optional[str] = None,
    ):
        """
        Upload a DICOM series to Encord storage.

        Args:
            file_paths: a list of paths to DICOM files, e.g.
                ['/home/user/data/DICOM_1.dcm', '/home/user/data/DICOM_2.dcm']
            cloud_upload_settings: Settings for uploading data into the cloud. Change
                this object to overwrite the default values.
            title: The title of the DICOM series. If unspecified, this will be
                randomly generated for you. The title should NOT include an
                extension, for example "encord_dicom_series".

        Returns:
            Bool.

        Raises:
            UploadOperationNotSupportedError: If trying to upload to external
                datasets (e.g. S3/GCP/Azure).
        """
        return self._client.create_dicom_series(file_paths, cloud_upload_settings=cloud_upload_settings, title=title)
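    # Usage sketch: collecting a series from a directory (hypothetical path; sorting
    # assumes the lexicographic file order matches the slice order):
    #
    #     dicom_files = sorted(str(p) for p in Path("/home/user/data/series").glob("*.dcm"))
    #     dataset.create_dicom_series(dicom_files, title="encord_dicom_series")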
    def upload_image(
        self,
        file_path: Union[Path, str],
        title: Optional[str] = None,
        cloud_upload_settings: CloudUploadSettings = CloudUploadSettings(),
    ) -> Image:
        """
        Upload a single image to Encord storage. If your images are sequential, we
        recommend creating an image group via the
        :meth:`.Dataset.create_image_group` function instead. For more information,
        compare https://docs.encord.com/docs/annotate/editor/images and
        https://docs.encord.com/docs/annotate/editor/videos

        Args:
            file_path: The file path to the image.
            title: The image title. If unspecified, this will be the file name. The
                title should include an extension, for example "encord_image.png".
            cloud_upload_settings: Settings for uploading data into the cloud. Change
                this object to overwrite the default values.
        """
        return self._client.upload_image(file_path, title, cloud_upload_settings)
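    # Usage sketch (hypothetical path; note that, unlike image group titles,
    # single-image titles should include the file extension):
    #
    #     image = dataset.upload_image(Path("/home/user/data/img1.png"), title="encord_image.png")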
    def delete_image_group(self, data_hash: str):
        """
        Delete an image group in Encord storage.

        Args:
            data_hash: the hash of the image group to delete
        """
        return self._client.delete_image_group(data_hash)
    def delete_data(self, data_hashes: List[str]):
        """
        Delete a video/image group from a dataset.

        Args:
            data_hashes: list of hashes of the videos/image groups you'd like to
                delete; all should belong to the same dataset
        """
        return self._client.delete_data(data_hashes)
    def add_private_data_to_dataset(
        self,
        integration_id: str,
        private_files: Union[str, Dict, Path, TextIO],
        ignore_errors: bool = False,
    ) -> AddPrivateDataResponse:
        """
        Append data hosted on a private cloud to an existing dataset.

        For a more complete example of safe uploads, please follow the guide found
        in our docs under
        :ref:`https://python.docs.encord.com/tutorials/datasets.html#adding-data-from-a-private-cloud <tutorials/datasets:Adding data from a private cloud>`

        Args:
            integration_id: EntityId of the cloud integration to be used when
                accessing those files.
            private_files: A str path or Path object to a JSON file, a JSON str, or
                a Python dictionary of the files you wish to add.
            ignore_errors: Ignore individual errors when trying to access the
                specified files.

        Returns:
            add_private_data_response: List of DatasetDataInfo objects containing
                data_hash and title.
        """
        return self._client.add_private_data_to_dataset(integration_id, private_files, ignore_errors)
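    # Usage sketch. The `private_files` payload shape and the `dataset_data_list`
    # response field below are assumptions; the exact JSON schema is described in
    # the tutorial linked above. The integration id comes from
    # `get_cloud_integrations()`:
    #
    #     integrations = dataset.get_cloud_integrations()
    #     response = dataset.add_private_data_to_dataset(
    #         integrations[0].id,
    #         {"videos": [{"objectUrl": "s3://my-bucket/video.mp4"}]},
    #         ignore_errors=True,
    #     )
    #     for info in response.dataset_data_list:
    #         print(info.data_hash, info.title)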
    def update_data_item(self, data_hash: str, new_title: str) -> bool:
        """
        Update a data item.

        Args:
            data_hash: Data hash of the item being updated.
            new_title: The new title of the data item being updated.

        Returns:
            Whether the update was successful.
        """
        return self._client.update_data_item(data_hash, new_title)
    def re_encode_data(self, data_hashes: List[str]):
        """
        Launch an async task that can re-encode a list of videos.

        Args:
            data_hashes: list of hashes of the videos you'd like to re-encode; all
                should belong to the same dataset

        Returns:
            EntityId (integer) of the launched async task.
        """
        return self._client.re_encode_data(data_hashes)
    def re_encode_data_status(self, job_id: int):
        """
        Return the status of an existing async task that is re-encoding videos.

        Args:
            job_id: id of the async task that was launched to re-encode the videos

        Returns:
            ReEncodeVideoTask: Object containing the status of the task, along with
                info about the newly encoded videos in case the task has completed.
        """
        return self._client.re_encode_data_status(job_id)
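    # Usage sketch: launch a re-encoding task and poll its status (hypothetical
    # hashes; the `status` attribute and the "SUBMITTED" in-progress value are
    # assumptions about ReEncodeVideoTask):
    #
    #     import time
    #
    #     job_id = dataset.re_encode_data(["<data_hash_1>", "<data_hash_2>"])
    #     task = dataset.re_encode_data_status(job_id)
    #     while task.status == "SUBMITTED":
    #         time.sleep(5)
    #         task = dataset.re_encode_data_status(job_id)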
    def run_ocr(self, image_group_id: str) -> List[ImageGroupOCR]:
        """
        Return optical character recognition results for a given image group.

        Args:
            image_group_id: the id of the image group in this dataset to run OCR on

        Returns:
            A list of ImageGroupOCR objects representing the text and corresponding
            coordinates found in each frame of the image group.
        """
        return self._client.run_ocr(image_group_id)
    def get_cloud_integrations(self) -> List[CloudIntegration]:
        return self._client.get_cloud_integrations()
    def _get_dataset_instance(self):
        if self._dataset_instance is None:
            self._dataset_instance = self.get_dataset()
        return self._dataset_instance