Source code for encord.dataset

from pathlib import Path
from typing import Dict, Iterable, List, Optional, TextIO, Union

from encord.client import EncordClientDataset
from encord.http.utils import CloudUploadSettings
from encord.orm.cloud_integration import CloudIntegration
from encord.orm.dataset import AddPrivateDataResponse, DataRow
from encord.orm.dataset import Dataset as OrmDataset
from encord.orm.dataset import Image, ImageGroupOCR, StorageLocation


class Dataset:
    """
    Access dataset related data and manipulate the dataset.

    All operations are delegated to the wrapped ``EncordClientDataset``. The
    dataset's own properties (hash, title, description, storage location and
    data rows) are fetched lazily on first access and then cached; call
    :meth:`refetch_data` to refresh the cache.
    """

    def __init__(self, client: EncordClientDataset):
        self._client = client
        # Lazily populated cache of the ORM dataset; see _get_dataset_instance.
        self._dataset_instance: Optional[OrmDataset] = None

    @property
    def dataset_hash(self) -> str:
        """
        Get the dataset hash (i.e. the Dataset ID).
        """
        dataset_instance = self._get_dataset_instance()
        return dataset_instance.dataset_hash

    @property
    def title(self) -> str:
        """
        Get the title of the (cached) dataset.
        """
        dataset_instance = self._get_dataset_instance()
        return dataset_instance.title

    @property
    def description(self) -> str:
        """
        Get the description of the (cached) dataset.
        """
        dataset_instance = self._get_dataset_instance()
        return dataset_instance.description

    @property
    def storage_location(self) -> StorageLocation:
        """
        Get the storage location of the (cached) dataset.
        """
        dataset_instance = self._get_dataset_instance()
        return dataset_instance.storage_location

    @property
    def data_rows(self) -> List[DataRow]:
        """
        Get the data rows of the (cached) dataset.
        """
        dataset_instance = self._get_dataset_instance()
        return dataset_instance.data_rows

    def refetch_data(self) -> None:
        """
        The Dataset class will only fetch its properties once. Use this function if you suspect the state of those
        properties to be dirty.
        """
        self._dataset_instance = self.get_dataset()

    def get_dataset(self) -> OrmDataset:
        """
        This function is exposed for convenience. You are encouraged to use the property accessors instead.

        Unlike the property accessors, this always asks the client for the dataset and does not touch the cache.
        """
        return self._client.get_dataset()

    # NOTE(review): the ``CloudUploadSettings()`` defaults used by the upload
    # methods below are evaluated once, when the method is defined, so every
    # call that omits ``cloud_upload_settings`` receives the same instance.
    # They are kept as-is to preserve the public signatures.

    def upload_video(
        self,
        file_path: str,
        cloud_upload_settings: CloudUploadSettings = CloudUploadSettings(),
        title: Optional[str] = None,
    ):
        """
        Upload video to Encord storage.

        Args:
            file_path: path to video e.g. '/home/user/data/video.mp4'
            cloud_upload_settings:
                Settings for uploading data into the cloud. Change this object to overwrite the default values.
            title:
                The video title. If unspecified, this will be the file name. This title should include an extension.
                For example "encord_video.mp4".

        Returns:
            Bool.

        Raises:
            UploadOperationNotSupportedError: If trying to upload to external
                                              datasets (e.g. S3/GCP/Azure)
        """
        return self._client.upload_video(file_path, cloud_upload_settings=cloud_upload_settings, title=title)

    def create_image_group(
        self,
        file_paths: Iterable[str],
        max_workers: Optional[int] = None,
        cloud_upload_settings: CloudUploadSettings = CloudUploadSettings(),
        title: Optional[str] = None,
    ):
        """
        Create an image group in Encord storage. Choose this type of image upload for sequential images. Else, you can
        choose the :meth:`.Dataset.upload_image` function.

        Args:
            file_paths: a list of paths to images, e.g.
                ['/home/user/data/img1.png', '/home/user/data/img2.png']
            max_workers:
                DEPRECATED: This argument will be ignored
            cloud_upload_settings:
                Settings for uploading data into the cloud. Change this object to overwrite the default values.
            title:
                The title of the image group. If unspecified this will be randomly generated for you. This title should
                NOT include an extension. For example "encord_image_group".

        Returns:
            Bool.

        Raises:
            UploadOperationNotSupportedError: If trying to upload to external
                                              datasets (e.g. S3/GCP/Azure)
        """
        # max_workers is intentionally not forwarded (deprecated, ignored).
        return self._client.create_image_group(file_paths, cloud_upload_settings=cloud_upload_settings, title=title)

    def create_dicom_series(
        self,
        file_paths: List[str],
        cloud_upload_settings: CloudUploadSettings = CloudUploadSettings(),
        title: Optional[str] = None,
    ):
        """
        Upload a DICOM series to Encord storage.

        Args:
            file_paths: a list of paths to DICOM files, e.g.
                ['/home/user/data/DICOM_1.dcm', '/home/user/data/DICOM_2.dcm']
            cloud_upload_settings:
                Settings for uploading data into the cloud. Change this object to overwrite the default values.
            title:
                The title of the DICOM series. If unspecified this will be randomly generated for you. This title should
                NOT include an extension. For example "encord_dicom_series".

        Returns:
            Bool.

        Raises:
            UploadOperationNotSupportedError: If trying to upload to external
                                              datasets (e.g. S3/GCP/Azure)
        """
        return self._client.create_dicom_series(file_paths, cloud_upload_settings=cloud_upload_settings, title=title)

    def upload_image(
        self,
        file_path: Union[Path, str],
        title: Optional[str] = None,
        cloud_upload_settings: CloudUploadSettings = CloudUploadSettings(),
    ) -> Image:
        """
        Upload a single image to Encord storage. If your images are sequential we recommend creating an image group via
        the :meth:`.Dataset.create_image_group` function. For more information please compare
        https://docs.encord.com/docs/annotate/editor/images and https://docs.encord.com/docs/annotate/editor/videos

        Args:
            file_path: The file path to the image
            title:
                The image title. If unspecified, this will be the file name. This title should include an extension.
                For example "encord_image.png".
            cloud_upload_settings:
                Settings for uploading data into the cloud. Change this object to overwrite the default values.

        Returns:
            The ``Image`` returned by the client for the uploaded file.
        """
        return self._client.upload_image(file_path, title, cloud_upload_settings)

    def delete_image_group(self, data_hash: str):
        """
        Delete an image group in Encord storage.

        Args:
            data_hash: the hash of the image group to delete
        """
        return self._client.delete_image_group(data_hash)

    def delete_data(self, data_hashes: List[str]):
        """
        Delete a video/image group from a dataset.

        Args:
            data_hashes: list of hash of the videos/image_groups you'd like to delete, all should belong to the same
             dataset
        """
        return self._client.delete_data(data_hashes)

    def add_private_data_to_dataset(
        self,
        integration_id: str,
        private_files: Union[str, Dict, Path, TextIO],
        ignore_errors: bool = False,
    ) -> AddPrivateDataResponse:
        """
        Append data hosted on private clouds to existing dataset.

        Args:
            integration_id:
                EntityId of the cloud integration to be used when accessing those files
            private_files:
                A str path or Path object to a json file, json str or python dictionary of the files you wish to add
            ignore_errors:
                Ignore individual errors when trying to access the specified files

        Returns:
            add_private_data_response List of DatasetDataInfo objects containing data_hash and title
        """
        return self._client.add_private_data_to_dataset(integration_id, private_files, ignore_errors)

    def update_data_item(self, data_hash: str, new_title: str) -> bool:
        """
        Update a data item.

        Args:
            data_hash: Data hash of the item being updated
            new_title: String containing the new title of the data item being updated

        Returns:
            Returns a boolean for whether the update was successful
        """
        return self._client.update_data_item(data_hash, new_title)

    def re_encode_data(self, data_hashes: List[str]):
        """
        Launches an async task that can re-encode a list of videos.

        Args:
            data_hashes: list of hash of the videos you'd like to re_encode, all should belong to the same dataset

        Returns:
            EntityId(integer) of the async task launched.
        """
        return self._client.re_encode_data(data_hashes)

    def re_encode_data_status(self, job_id: int):
        """
        Returns the status of an existing async task which is aimed at re-encoding videos.

        Args:
            job_id: id of the async task that was launched to re-encode the videos

        Returns:
            ReEncodeVideoTask: Object containing the status of the task, along with info about the new encoded videos
             in case the task has been completed
        """
        return self._client.re_encode_data_status(job_id)

    def run_ocr(self, image_group_id: str) -> List[ImageGroupOCR]:
        """
        Returns an optical character recognition result for a given image group.

        Args:
            image_group_id: the id of the image group in this dataset to run OCR on

        Returns:
            Returns a list of ImageGroupOCR objects representing the text and corresponding coordinates
            found in each frame of the image group
        """
        return self._client.run_ocr(image_group_id)

    def get_cloud_integrations(self) -> List[CloudIntegration]:
        """
        Return the cloud integrations reported by the underlying client.
        """
        return self._client.get_cloud_integrations()

    def _get_dataset_instance(self) -> OrmDataset:
        """
        Return the cached ORM dataset, fetching it via :meth:`get_dataset` on first use.
        """
        if self._dataset_instance is None:
            self._dataset_instance = self.get_dataset()
        return self._dataset_instance