
Reference datasets

This file describes methods associated with dataset creation and metadata.

DatasetFactory

Base class for creating datasets.

Attributes:

    df (DataFrame): A full dataframe of the data.
    summary (dict): Summary of the dataset.
    root (str): Root directory for the data.
    update_wrong_labels (bool): Whether fix_labels should be called.
    unknown_name (str): Name of the unknown class.
    outdated_dataset (bool): Tracks whether the dataset was replaced by a newer version.
    determined_by_df (bool): Specifies whether the dataset is completely determined by its dataframe.
    saved_to_system_folder (bool): Specifies whether the dataset is saved to system (hidden) folders.
    transform (Callable): Transform applied when loading an image.
    img_load (str): Image loading mode such as 'full', 'bbox' or 'bbox_mask'.
    labels_string (List[str]): List of labels as strings.

Source code in wildlife_datasets/datasets/datasets.py
class DatasetFactory:
    """Base class for creating datasets.

    Attributes:    
      df (pd.DataFrame): A full dataframe of the data.
      summary (dict): Summary of the dataset.
      root (str): Root directory for the data.
      update_wrong_labels (bool): Whether `fix_labels` should be called.
      unknown_name (str): Name of the unknown class.
      outdated_dataset (bool): Tracks whether dataset was replaced by a new version.
      determined_by_df (bool): Specifies whether dataset is completely determined by its dataframe.
      saved_to_system_folder (bool): Specifies whether dataset is saved to system (hidden) folders.
      transform (Callable): Applied transform when loading the image.
      img_load (str): Image loading mode such as 'full', 'bbox' or 'bbox_mask'.
      labels_string (List[str]): List of labels in strings.
    """

    unknown_name = 'unknown'
    outdated_dataset = False
    determined_by_df = True
    saved_to_system_folder = False
    download_warning = '''You are trying to download an already downloaded dataset.
        This message may be caused by an interrupted download or extraction.
        To force the download use the `force=True` keyword such as
        get_data(..., force=True) or download(..., force=True).
        '''
    download_mark_name = 'already_downloaded'
    license_file_name = 'LICENSE_link'

    def __init__(
            self, 
            root: Optional[str] = None,
            df: Optional[pd.DataFrame] = None,
            update_wrong_labels: bool = True,
            transform: Optional[Callable] = None,
            img_load: str = "full",
            remove_unknown: bool = False,
            **kwargs) -> None:
        """Initializes the class.

        If `df` is specified, it copies it. Otherwise, it creates it
        by the `create_catalogue` method.

        Args:
            root (Optional[str], optional): Root directory for the data.
            df (Optional[pd.DataFrame], optional): A full dataframe of the data.
            update_wrong_labels (bool, optional): Whether `fix_labels` should be called.
            transform (Optional[Callable], optional): Applied transform when loading the image.
            img_load (str, optional): Image loading mode such as 'full', 'bbox' or 'bbox_mask'.
            remove_unknown (bool, optional): Whether unknown identities should be removed.
        """

        if not self.saved_to_system_folder and not os.path.exists(root):
            raise Exception('root does not exist. You may have misspelled it.')
        if self.outdated_dataset:
            print('This dataset is outdated. You may want to call a newer version such as %sv2.' % self.__class__.__name__)
        self.update_wrong_labels = update_wrong_labels
        self.root = root
        if df is None:
            df = self.create_catalogue(**kwargs)
        else:
            if not self.determined_by_df:
                print('This dataset is not determined by its dataframe, but you are constructing it from one.')
        if remove_unknown:
            df = df[df['identity'] != self.unknown_name]
        self.df = df.reset_index(drop=True)
        self.metadata = self.df # Alias to df to unify with wildlife-tools
        self.transform = transform
        self.img_load = img_load
        if self.img_load == "auto":
            if "segmentation" in self.df:
                self.img_load = "bbox_mask"
            elif "bbox" in self.df:
                self.img_load = "bbox"
            else:
                self.img_load = "full"

    @property
    def labels_string(self):
        return self.df['identity'].astype(str).to_numpy()

    @property
    def num_classes(self):
        return self.df['identity'].nunique()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int) -> Image:
        """Load an image with iloc `idx` with transforms `self.transform` and `self.img_load` applied.

        Args:
            idx (int): Index of the image.

        Returns:
            Loaded image.
        """

        img = self.get_image(idx)
        return self.apply_segmentation(img, idx)

    def get_image(self, idx: int) -> Image:
        """Load an image with iloc `idx`.

        Args:
            idx (int): Index of the image.

        Returns:
            Loaded image.
        """

        data = self.df.iloc[idx]
        if self.root:
            img_path = os.path.join(self.root, data['path'])
        else:
            img_path = data['path']
        img = self.load_image(img_path)
        return img

    def load_image(self, path: str) -> Image:
        """Load an image with `path`.

        Args:
            path (str): Path to the image.

        Returns:
            Loaded image.
        """

        return utils.load_image(path)

    def apply_segmentation(self, img: Image, idx: int) -> Image:
        """Applies segmentation or bounding box when loading an image.

        Args:
            img (Image): Loaded image.
            idx (int): Index of the image.

        Returns:
            Loaded image.
        """

        # Prepare for segmentations        
        if self.img_load in ["full_mask", "full_hide", "bbox_mask", "bbox_hide"]:
            data = self.df.iloc[idx]
            if not ("segmentation" in data):
                raise ValueError(f"{self.img_load} selected but no segmentation found.")
            segmentation = data["segmentation"]
            if isinstance(segmentation, list) or isinstance(segmentation, np.ndarray):
                # Convert polygon to compressed RLE
                w, h = img.size
                rles = mask_coco.frPyObjects([segmentation], h, w)
                segmentation = mask_coco.merge(rles)
            elif isinstance(segmentation, dict) and (isinstance(segmentation['counts'], list) or isinstance(segmentation['counts'], np.ndarray)):            
                # Convert uncompressed RLE to compressed RLE
                h, w = segmentation['size']
                segmentation = mask_coco.frPyObjects(segmentation, h, w)
            elif isinstance(segmentation, str):
                # Load image mask and convert it to compressed RLE
                segmentation = np.asfortranarray(utils.load_image(os.path.join(self.root, segmentation)))
                if segmentation.ndim == 3:
                    segmentation = segmentation[:,:,0]
                segmentation = mask_coco.encode(segmentation)
            elif not np.any(pd.isnull(segmentation)):
                raise Exception('Segmentation type not recognized')
        # Prepare for bounding boxes
        if self.img_load in ["bbox"]:
            data = self.df.iloc[idx]
            if not ("bbox" in data):
                raise ValueError(f"{self.img_load} selected but no bbox found.")
            if type(data["bbox"]) == str:
                bbox = json.loads(data["bbox"])
            else:
                bbox = data["bbox"]

        # Load full image as it is.
        if self.img_load == "full":
            img = img
        # Mask background using segmentation mask.
        elif self.img_load == "full_mask":
            if not np.any(pd.isnull(segmentation)):
                mask = mask_coco.decode(segmentation).astype("bool")
                img = Image.fromarray(img * mask[..., np.newaxis])
        # Hide object using segmentation mask
        elif self.img_load == "full_hide":
            if not np.any(pd.isnull(segmentation)):
                mask = mask_coco.decode(segmentation).astype("bool")
                img = Image.fromarray(img * ~mask[..., np.newaxis])
        # Crop to bounding box
        elif self.img_load == "bbox":
            if not np.any(pd.isnull(bbox)):
                img = img.crop((bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]))
        # Mask background using segmentation mask and crop to bounding box.
        elif self.img_load == "bbox_mask":
            if (not np.any(pd.isnull(segmentation))):
                mask = mask_coco.decode(segmentation).astype("bool")
                img = Image.fromarray(img * mask[..., np.newaxis])
                img = utils.crop_black(img)
        # Hide object using segmentation mask and crop to bounding box.
        elif self.img_load == "bbox_hide":
            if (not np.any(pd.isnull(segmentation))):
                mask = mask_coco.decode(segmentation).astype("bool")
                img = Image.fromarray(img * ~mask[..., np.newaxis])
                img = utils.crop_black(img)
        # Crop black background around images
        elif self.img_load == "crop_black":
            img = utils.crop_black(img)
        else:
            raise ValueError(f"Invalid img_load argument: {self.img_load}")

        if self.transform:
            img = self.transform(img)

        return img

    @classmethod
    def get_data(
            cls,
            root: str,
            force: bool = False,
            **kwargs
            ) -> None:
        """Downloads and extracts the data. Wrapper around `cls._download` and `cls._extract.`

        Args:
            root (str): Where the data should be stored.
            force (bool, optional): It the root exists, whether it should be overwritten.
        """

        dataset_name = cls.__name__
        mark_file_name = os.path.join(root, cls.download_mark_name)

        already_downloaded = os.path.exists(mark_file_name)
        if not cls.saved_to_system_folder and already_downloaded and not force:
            print('DATASET %s: DOWNLOADING STARTED.' % dataset_name)
            print(cls.download_warning)
        else:
            print('DATASET %s: DOWNLOADING STARTED.' % dataset_name)
            cls.download(root, force=force, **kwargs)
            print('DATASET %s: EXTRACTING STARTED.' % dataset_name)
            cls.extract(root,  **kwargs)
            print('DATASET %s: FINISHED.\n' % dataset_name)

    @classmethod
    def download(
            cls,
            root: str,
            force: bool = False,
            **kwargs
            ) -> None:
        """Downloads the data. Wrapper around `cls._download`.

        Args:
            root (str): Where the data should be stored.
            force (bool, optional): If the root exists, whether it should be overwritten.
        """

        dataset_name = cls.__name__
        mark_file_name = os.path.join(root, cls.download_mark_name)

        already_downloaded = os.path.exists(mark_file_name)
        if cls.saved_to_system_folder:
            cls._download(**kwargs)
        elif already_downloaded and not force:
            print('DATASET %s: DOWNLOADING STARTED.' % dataset_name)            
            print(cls.download_warning)
        else:
            if os.path.exists(mark_file_name):
                os.remove(mark_file_name)
            with utils.data_directory(root):
                cls._download(**kwargs)
            open(mark_file_name, 'a').close()
            if hasattr(cls, 'summary') and 'licenses_url' in cls.summary:
                with open(os.path.join(root, cls.license_file_name), 'w') as file:
                    file.write(cls.summary['licenses_url'])

    @classmethod    
    def extract(cls, root: str, **kwargs) -> None:
        """Extract the data. Wrapper around `cls._extract`.

        Args:
            root (str): Where the data should be stored.
        """

        if cls.saved_to_system_folder:
            cls._extract(**kwargs)
        else:
            with utils.data_directory(root):
                cls._extract(**kwargs)
            mark_file_name = os.path.join(root, cls.download_mark_name)
            open(mark_file_name, 'a').close()

    @classmethod
    def display_name(cls) -> str:
        """Returns name of the dataset without the v2 ending.

        Returns:
            Name of the dataset.
        """

        cls_parent = cls.__bases__[0]
        while cls_parent != object and cls_parent.outdated_dataset:
            cls = cls_parent
            cls_parent = cls.__bases__[0]            
        return cls.__name__

    def _download(self):
        """Downloads the dataset. Needs to be implemented by subclasses.

        Raises:
            NotImplementedError: Needs to be implemented by subclasses.
        """

        raise NotImplementedError('Needs to be implemented by subclasses.')

    def _extract(self):
        """Extracts the dataset. Needs to be implemented by subclasses.

        Raises:
            NotImplementedError: Needs to be implemented by subclasses.
        """

        raise NotImplementedError('Needs to be implemented by subclasses.')

    def create_catalogue(self):
        """Creates the dataframe.

        Raises:
            NotImplementedError: Needs to be implemented by subclasses.
        """

        raise NotImplementedError('Needs to be implemented by subclasses.')

    def fix_labels(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fixes labels in dataframe.

        Automatically called in `finalize_catalogue`.                
        """

        return df

    def fix_labels_replace_identity(
            self,
            df: pd.DataFrame,
            replace_identity: List[Tuple],
            col: str = 'identity'
            ) -> pd.DataFrame:
        """Replaces all instances of identities.

        Args:
            df (pd.DataFrame): A full dataframe of the data.
            replace_identity (List[Tuple]): List of (old_identity, new_identity)
            col (str, optional): Column to replace in.

        Returns:
            A full dataframe of the data.
        """
        for old_identity, new_identity in replace_identity:
            df[col] = df[col].replace({old_identity: new_identity})
        return df

    def fix_labels_remove_identity(
            self,
            df: pd.DataFrame,
            identities_to_remove: List,
            col: str = 'identity'
            ) -> pd.DataFrame:
        """Removes all instances of identities.

        Args:
            df (pd.DataFrame): A full dataframe of the data.
            identities_to_remove (List): List of identities to remove.
            col (str, optional): Column to remove from.

        Returns:
            A full dataframe of the data.
        """
        idx_remove = [identity in identities_to_remove for identity in df[col]]
        return df[~np.array(idx_remove)]

    def fix_labels_replace_images(
            self,
            df: pd.DataFrame,
            replace_identity: List[Tuple],
            col: str = 'identity'
            ) -> pd.DataFrame:
        """Replaces specified images with specified identities.

        It matches `image_name` as a substring of df['path'],
        which may cause problems with `os.path.sep`.

        Args:
            df (pd.DataFrame): A full dataframe of the data.
            replace_identity (List[Tuple]): List of (image_name, old_identity, new_identity).
            col (str, optional): Column to replace in.

        Returns:
            A full dataframe of the data.
        """
        for image_name, old_identity, new_identity in replace_identity:
            n_replaced = 0
            for index, df_row in df.iterrows():
                # Check that there is an image with the required name and identity
                if image_name in df_row['path'] and old_identity == df_row[col]:
                    df.loc[index, col] = new_identity
                    n_replaced += 1
            if n_replaced == 0:
                print('File name %s with identity %s was not found.' % (image_name, str(old_identity)))
            elif n_replaced > 1:
                print('File name %s with identity %s was found multiple times.' % (image_name, str(old_identity)))
        return df

    def finalize_catalogue(self, df: pd.DataFrame) -> pd.DataFrame:
        """Reorders the dataframe and check file paths.

        Reorders the columns and removes constant columns.
        Checks if columns are in correct formats.
        Checks if ids are unique and if all files exist.

        Args:
            df (pd.DataFrame): A full dataframe of the data.

        Returns:
            A full dataframe of the data, slightly modified.
        """

        if self.update_wrong_labels:
            df = self.fix_labels(df)
        self.check_required_columns(df)
        self.check_types_columns(df)
        df = self.reorder_df(df)
        df = self.remove_constant_columns(df)
        self.check_unique_id(df)
        self.check_files_exist(df['path'])
        self.check_files_names(df['path'])
        if 'segmentation' in df.columns:
            self.check_files_exist(df['segmentation'])
        return df

    def check_required_columns(self, df: pd.DataFrame) -> None:
        """Check if all required columns are present.

        Args:
            df (pd.DataFrame): A full dataframe of the data.
        """

        for col_name in ['image_id', 'identity', 'path']:
            if col_name not in df.columns:
                raise(Exception('Column %s must be in the dataframe columns.' % col_name))

    def check_types_columns(self, df: pd.DataFrame) -> None:
        """Checks if columns are in correct formats.

        The formats are specified in `requirements`, which is a list
        of tuples. The first value is the name of the column
        and the second value is a list of allowed formats. The column
        must match at least one of the formats.

        Args:
            df (pd.DataFrame): A full dataframe of the data.
        """

        requirements = [
            ('image_id', ['int', 'str']),
            ('identity', ['int', 'str']),
            ('path', ['str']),
            ('bbox', ['list_numeric']),
            ('date', ['date']),
            ('keypoints', ['list_numeric']),
            ('position', ['str']),
            ('species', ['str', 'list']),
            ('video', ['int']),
        ]
        # Verify if the columns are in correct formats
        for col_name, allowed_types in requirements:
            if col_name in df.columns:
                # Remove empty values to be sure
                col = df[col_name][~df[col_name].isnull()]
                if len(col) > 0:
                    self.check_types_column(col, col_name, allowed_types)

    def check_types_column(self, col: pd.Series, col_name: str, allowed_types: List[str]) -> None:
        """Checks if the column `col` is in the format `allowed_types`.

        Args:
            col (pd.Series): Column to be checked.
            col_name (str): Column name used only for raising exceptions.
            allowed_types (List[str]): List of strings with allowed values:
                `int` (all values must be integers),
                `str` (strings),
                `list` (lists),
                `list_numeric` (lists with numeric values),
                `date` (dates as tested by `pd.to_datetime`).
        """

        if 'int' in allowed_types and pd.api.types.is_integer_dtype(col):
            return None
        if 'str' in allowed_types and pd.api.types.is_string_dtype(col):
            return None
        if 'list' in allowed_types and pd.api.types.is_list_like(col):
            check = True
            for val in col:
                if not pd.api.types.is_list_like(val):
                    check = False
                    break
            if check:                
                return None        
        if 'list_numeric' in allowed_types and pd.api.types.is_list_like(col):
            check = True
            for val in col:            
                if not pd.api.types.is_list_like(val) and not pd.api.types.is_numeric_dtype(pd.Series(val)):
                    check = False
                    break
            if check:                
                return None
        if 'date' in allowed_types:
            try:
                pd.to_datetime(col)
                return None
            except:
                pass
        raise(Exception('Column %s has wrong type. Allowed types = %s' % (col_name, str(allowed_types))))

    def reorder_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Reorders rows and columns in the dataframe.

        Rows are sorted based on id.
        Columns are reordered based on the `default_order` list.

        Args:
            df (pd.DataFrame): A full dataframe of the data.

        Returns:
            A full dataframe of the data, slightly modified.
        """

        default_order = ['image_id', 'identity', 'path', 'bbox', 'date', 'keypoints', 'orientation', 'segmentation', 'species']
        df_names = list(df.columns)
        col_names = []
        for name in default_order:
            if name in df_names:
                col_names.append(name)
        for name in df_names:
            if name not in default_order:
                col_names.append(name)

        df = df.sort_values('image_id').reset_index(drop=True)
        return df.reindex(columns=col_names)

    def remove_constant_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Removes columns with a single unique value.

        Args:
            df (pd.DataFrame): A full dataframe of the data.

        Returns:
            A full dataframe of the data, slightly modified.
        """ 

        for df_name in list(df.columns):
            if df[df_name].astype('str').nunique() == 1:
                df = df.drop([df_name], axis=1)
        return df

    def check_unique_id(self, df: pd.DataFrame) -> None:
        """Checks if values in the id column are unique.

        Args:
            df (pd.DataFrame): A full dataframe of the data.
        """

        if len(df['image_id'].unique()) != len(df):
            raise(Exception('Image ID not unique.'))

    def check_files_exist(self, col: pd.Series) -> None:
        """Checks if paths in a given column exist.

        Args:
            col (pd.Series): A column of a dataframe.
        """

        for path in col:
            if type(path) == str and not os.path.exists(os.path.join(self.root, path)):
                raise(Exception('Path does not exist: ' + os.path.join(self.root, path)))

    def check_files_names(self, col: pd.Series) -> None:
        """Checks if paths contain .

        Args:
            col (pd.Series): A column of a dataframe.
        """

        for path in col:
            try:
                path.encode("iso-8859-1")
            except UnicodeEncodeError:
                raise(Exception('Characters in path may cause problems. Please use only ISO-8859-1 characters: ' + path))

    def plot_grid(
            self,
            n_rows: int = 5,
            n_cols: int = 8,
            offset: float = 10,
            img_min: float = 100,
            rotate: bool = True,
            header_cols: Optional[List[str]] = None,
            idx: Optional[Union[List[bool],List[int]]] = None,
            background_color: Tuple[int] = (0, 0, 0),
            **kwargs
            ) -> Optional[plt.Figure]:
        """Plots a grid of size (n_rows, n_cols) with images from the dataframe.

        Args:
            n_rows (int, optional): The number of rows in the grid.
            n_cols (int, optional): The number of columns in the grid.
            offset (float, optional): The offset between images.
            img_min (float, optional): The minimal size of the plotted images.
            rotate (bool, optional): Rotates the images to have the same orientation.
            header_cols (Optional[List[str]], optional): List of headers for each column.
            idx (Optional[Union[List[bool],List[int]]], optional): List of indices to plot. None plots random images. Index -1 plots an empty image.
            background_color (Tuple[int], optional): Background color of the grid.
        """

        if len(self.df) == 0:
            return None

        # Select indices of images to be plotted
        if idx is None:
            n = min(len(self.df), n_rows*n_cols)
            idx = np.random.permutation(len(self.df))[:n]
        else:
            if isinstance(idx, pd.Series):
                idx = idx.values
            if isinstance(idx[0], (bool, np.bool_)):
                idx = np.where(idx)[0]
            n = min(np.array(idx).size, n_rows*n_cols)
            idx = np.matrix.flatten(np.array(idx))[:n]

        # Load images and compute their ratio
        ratios = []
        ims = []
        for k in idx:
            if k >= 0:
                # Load the image with index k
                im = self[k]
                ims.append(im)
                ratios.append(im.size[0] / im.size[1])
            else:
                # Load a black image
                ims.append(Image.fromarray(np.zeros((2, 2), dtype = "uint8")))

        # Safeguard when all indices are -1
        if len(ratios) == 0:
            return None

        # Get the size of the images after being resized
        ratio = np.median(ratios)
        if ratio > 1:    
            img_w, img_h = int(img_min*ratio), int(img_min)
        else:
            img_w, img_h = int(img_min), int(img_min/ratio)

        # Compute height offset if headers are present
        if header_cols is not None:
            offset_h = 30
            if len(header_cols) != n_cols:
                raise(Exception("Length of header_cols must be the same as n_cols."))
        else:
            offset_h = 0

        # Create an empty image grid
        im_grid = Image.new('RGB', (n_cols*img_w + (n_cols-1)*offset, offset_h + n_rows*img_h + (n_rows-1)*offset), background_color)

        # Fill the grid image by image
        pos_y = offset_h
        for i in range(n_rows):
            row_h = 0
            for j in range(n_cols):
                k = (n_cols)*i + j
                if k < n:
                    # Possibly rotate the image
                    im = ims[k]
                    if rotate and ((ratio > 1 and im.size[0] < im.size[1]) or (ratio < 1 and im.size[0] > im.size[1])):
                        im = im.transpose(Image.Transpose.ROTATE_90)

                    # Rescale the image
                    im.thumbnail((img_w,img_h))
                    row_h = max(row_h, im.size[1])

                    # Place the image on the grid
                    pos_x = j*img_w + j*offset
                    im_grid.paste(im, (pos_x,pos_y))
            if row_h > 0:
                pos_y += row_h + offset
        im_grid = im_grid.crop((0, 0, im_grid.size[0], pos_y-offset))

        # Plot the image and add column headers if present
        fig = plt.figure()
        fig.patch.set_visible(False)
        ax = fig.add_subplot(111)
        plt.axis('off')
        plt.imshow(im_grid)
        if header_cols is not None:
            color = kwargs.pop('color', 'white')
            ha = kwargs.pop('ha', 'center')
            va = kwargs.pop('va', 'center')
            for i, header in enumerate(header_cols):
                pos_x = (i+0.5)*img_w + i*offset
                pos_y = offset_h/2
                plt.text(pos_x, pos_y, str(header), color=color, ha=ha, va=va, **kwargs)
        return fig
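
For illustration, a minimal usage sketch. It assumes an existing DatasetFactory subclass (MacaqueFaces is used here as an example) whose data were already downloaded to the given root; any subclass works the same way.

from wildlife_datasets import datasets

# Hypothetical root; point it to wherever the data were downloaded.
dataset = datasets.MacaqueFaces('data/MacaqueFaces')
print(len(dataset), dataset.num_classes)  # number of images and identities
img = dataset[0]                          # PIL image with transforms applied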

__getitem__(idx)

Load an image with iloc idx with transforms self.transform and self.img_load applied.

Parameters:

    idx (int): Index of the image. Required.

Returns:

    Image: Loaded image.

Source code in wildlife_datasets/datasets/datasets.py
def __getitem__(self, idx: int) -> Image:
    """Load an image with iloc `idx` with transforms `self.transform` and `self.img_load` applied.

    Args:
        idx (int): Index of the image.

    Returns:
        Loaded image.
    """

    img = self.get_image(idx)
    return self.apply_segmentation(img, idx)

__init__(root=None, df=None, update_wrong_labels=True, transform=None, img_load='full', remove_unknown=False, **kwargs)

Initializes the class.

If df is specified, it copies it. Otherwise, it creates it by the create_catalogue method.

Parameters:

    root (Optional[str]): Root directory for the data. Default: None.
    df (Optional[DataFrame]): A full dataframe of the data. Default: None.
    update_wrong_labels (bool): Whether fix_labels should be called. Default: True.
    transform (Optional[Callable]): Transform applied when loading an image. Default: None.
    img_load (str): Image loading mode such as 'full', 'bbox' or 'bbox_mask'. Default: 'full'.
    remove_unknown (bool): Whether unknown identities should be removed. Default: False.
Source code in wildlife_datasets/datasets/datasets.py
def __init__(
        self, 
        root: Optional[str] = None,
        df: Optional[pd.DataFrame] = None,
        update_wrong_labels: bool = True,
        transform: Optional[Callable] = None,
        img_load: str = "full",
        remove_unknown: bool = False,
        **kwargs) -> None:
    """Initializes the class.

    If `df` is specified, it copies it. Otherwise, it creates it
    by the `create_catalogue` method.

    Args:
        root (Optional[str], optional): Root directory for the data.
        df (Optional[pd.DataFrame], optional): A full dataframe of the data.
        update_wrong_labels (bool, optional): Whether `fix_labels` should be called.
        transform (Optional[Callable], optional): Applied transform when loading the image.
        img_load (str, optional): Image loading mode such as 'full', 'bbox' or 'bbox_mask'.
        remove_unknown (bool, optional): Whether unknown identities should be removed.
    """

    if not self.saved_to_system_folder and not os.path.exists(root):
        raise Exception('root does not exist. You may have misspelled it.')
    if self.outdated_dataset:
        print('This dataset is outdated. You may want to call a newer version such as %sv2.' % self.__class__.__name__)
    self.update_wrong_labels = update_wrong_labels
    self.root = root
    if df is None:
        df = self.create_catalogue(**kwargs)
    else:
        if not self.determined_by_df:
            print('This dataset is not determined by its dataframe, but you are constructing it from one.')
    if remove_unknown:
        df = df[df['identity'] != self.unknown_name]
    self.df = df.reset_index(drop=True)
    self.metadata = self.df # Alias to df to unify with wildlife-tools
    self.transform = transform
    self.img_load = img_load
    if self.img_load == "auto":
        if "segmentation" in self.df:
            self.img_load = "bbox_mask"
        elif "bbox" in self.df:
            self.img_load = "bbox"
        else:
            self.img_load = "full"
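
A hedged sketch of reusing a precomputed catalogue (again assuming the MacaqueFaces subclass and previously downloaded data):

from wildlife_datasets import datasets

# Build the catalogue once, then reuse it; img_load='auto' picks
# 'bbox_mask', 'bbox' or 'full' depending on the available columns.
dataset = datasets.MacaqueFaces('data/MacaqueFaces')
dataset2 = datasets.MacaqueFaces(
    'data/MacaqueFaces',
    df=dataset.df,
    img_load='auto',
    remove_unknown=True,
)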

apply_segmentation(img, idx)

Applies segmentation or bounding box when loading an image.

Parameters:

    img (Image): Loaded image. Required.
    idx (int): Index of the image. Required.

Returns:

    Image: Loaded image.

Source code in wildlife_datasets/datasets/datasets.py
def apply_segmentation(self, img: Image, idx: int) -> Image:
    """Applies segmentation or bounding box when loading an image.

    Args:
        img (Image): Loaded image.
        idx (int): Index of the image.

    Returns:
        Loaded image.
    """

    # Prepare for segmentations        
    if self.img_load in ["full_mask", "full_hide", "bbox_mask", "bbox_hide"]:
        data = self.df.iloc[idx]
        if not ("segmentation" in data):
            raise ValueError(f"{self.img_load} selected but no segmentation found.")
        segmentation = data["segmentation"]
        if isinstance(segmentation, list) or isinstance(segmentation, np.ndarray):
            # Convert polygon to compressed RLE
            w, h = img.size
            rles = mask_coco.frPyObjects([segmentation], h, w)
            segmentation = mask_coco.merge(rles)
        elif isinstance(segmentation, dict) and (isinstance(segmentation['counts'], list) or isinstance(segmentation['counts'], np.ndarray)):            
            # Convert uncompressed RLE to compressed RLE
            h, w = segmentation['size']
            segmentation = mask_coco.frPyObjects(segmentation, h, w)
        elif isinstance(segmentation, str):
            # Load image mask and convert it to compressed RLE
            segmentation = np.asfortranarray(utils.load_image(os.path.join(self.root, segmentation)))
            if segmentation.ndim == 3:
                segmentation = segmentation[:,:,0]
            segmentation = mask_coco.encode(segmentation)
        elif not np.any(pd.isnull(segmentation)):
            raise Exception('Segmentation type not recognized')
    # Prepare for bounding boxes
    if self.img_load in ["bbox"]:
        data = self.df.iloc[idx]
        if not ("bbox" in data):
            raise ValueError(f"{self.img_load} selected but no bbox found.")
        if type(data["bbox"]) == str:
            bbox = json.loads(data["bbox"])
        else:
            bbox = data["bbox"]

    # Load full image as it is.
    if self.img_load == "full":
        img = img
    # Mask background using segmentation mask.
    elif self.img_load == "full_mask":
        if not np.any(pd.isnull(segmentation)):
            mask = mask_coco.decode(segmentation).astype("bool")
            img = Image.fromarray(img * mask[..., np.newaxis])
    # Hide object using segmentation mask
    elif self.img_load == "full_hide":
        if not np.any(pd.isnull(segmentation)):
            mask = mask_coco.decode(segmentation).astype("bool")
            img = Image.fromarray(img * ~mask[..., np.newaxis])
    # Crop to bounding box
    elif self.img_load == "bbox":
        if not np.any(pd.isnull(bbox)):
            img = img.crop((bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]))
    # Mask background using segmentation mask and crop to bounding box.
    elif self.img_load == "bbox_mask":
        if (not np.any(pd.isnull(segmentation))):
            mask = mask_coco.decode(segmentation).astype("bool")
            img = Image.fromarray(img * mask[..., np.newaxis])
            img = utils.crop_black(img)
    # Hide object using segmentation mask and crop to bounding box.
    elif self.img_load == "bbox_hide":
        if (not np.any(pd.isnull(segmentation))):
            mask = mask_coco.decode(segmentation).astype("bool")
            img = Image.fromarray(img * ~mask[..., np.newaxis])
            img = utils.crop_black(img)
    # Crop black background around images
    elif self.img_load == "crop_black":
        img = utils.crop_black(img)
    else:
        raise ValueError(f"Invalid img_load argument: {self.img_load}")

    if self.transform:
        img = self.transform(img)

    return img
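
The bounding-box branch assumes COCO-style [x, y, w, h] boxes, which are converted to PIL's (left, upper, right, lower) crop box. A self-contained sketch of that conversion with made-up values:

from PIL import Image

img = Image.new('RGB', (200, 100))
bbox = [10, 20, 50, 30]  # hypothetical [x, y, width, height]
crop = img.crop((bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]))
print(crop.size)  # (50, 30)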

check_files_exist(col)

Checks if paths in a given column exist.

Parameters:

    col (Series): A column of a dataframe. Required.
Source code in wildlife_datasets/datasets/datasets.py
def check_files_exist(self, col: pd.Series) -> None:
    """Checks if paths in a given column exist.

    Args:
        col (pd.Series): A column of a dataframe.
    """

    for path in col:
        if type(path) == str and not os.path.exists(os.path.join(self.root, path)):
            raise(Exception('Path does not exist: ' + os.path.join(self.root, path)))

check_files_names(col)

Checks that paths contain only ISO-8859-1 characters.

Parameters:

    col (Series): A column of a dataframe. Required.
Source code in wildlife_datasets/datasets/datasets.py
def check_files_names(self, col: pd.Series) -> None:
    """Checks if paths contain .

    Args:
        col (pd.Series): A column of a dataframe.
    """

    for path in col:
        try:
            path.encode("iso-8859-1")
        except UnicodeEncodeError:
            raise(Exception('Characters in path may cause problems. Please use only ISO-8859-1 characters: ' + path))
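
The check mirrors what str.encode does; a short sketch with made-up file names:

# ISO-8859-1 (Latin-1) covers only Western European characters.
'girafe.jpg'.encode('iso-8859-1')        # passes
try:
    'žirafa.jpg'.encode('iso-8859-1')    # 'ž' is not Latin-1
except UnicodeEncodeError:
    print('non Latin-1 character found')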

check_required_columns(df)

Check if all required columns are present.

Parameters:

    df (DataFrame): A full dataframe of the data. Required.
Source code in wildlife_datasets/datasets/datasets.py
def check_required_columns(self, df: pd.DataFrame) -> None:
    """Check if all required columns are present.

    Args:
        df (pd.DataFrame): A full dataframe of the data.
    """

    for col_name in ['image_id', 'identity', 'path']:
        if col_name not in df.columns:
            raise(Exception('Column %s must be in the dataframe columns.' % col_name))

check_types_column(col, col_name, allowed_types)

Checks if the column col is in the format allowed_types.

Parameters:

    col (Series): Column to be checked. Required.
    col_name (str): Column name used only for raising exceptions. Required.
    allowed_types (List[str]): List of strings with allowed values: 'int' (all values must be integers), 'str' (strings), 'list' (lists), 'list_numeric' (lists with numeric values), 'date' (dates as tested by pd.to_datetime). Required.
Source code in wildlife_datasets/datasets/datasets.py
def check_types_column(self, col: pd.Series, col_name: str, allowed_types: List[str]) -> None:
    """Checks if the column `col` is in the format `allowed_types`.

    Args:
        col (pd.Series): Column to be checked.
        col_name (str): Column name used only for raising exceptions.
        allowed_types (List[str]): List of strings with allowed values:
            `int` (all values must be integers),
            `str` (strings),
            `list` (lists),
            `list_numeric` (lists with numeric values),
            `date` (dates as tested by `pd.to_datetime`).
    """

    if 'int' in allowed_types and pd.api.types.is_integer_dtype(col):
        return None
    if 'str' in allowed_types and pd.api.types.is_string_dtype(col):
        return None
    if 'list' in allowed_types and pd.api.types.is_list_like(col):
        check = True
        for val in col:
            if not pd.api.types.is_list_like(val):
                check = False
                break
        if check:                
            return None        
    if 'list_numeric' in allowed_types and pd.api.types.is_list_like(col):
        check = True
        for val in col:            
            if not pd.api.types.is_list_like(val) and not pd.api.types.is_numeric_dtype(pd.Series(val)):
                check = False
                break
        if check:                
            return None
    if 'date' in allowed_types:
        try:
            pd.to_datetime(col)
            return None
        except:
            pass
    raise(Exception('Column %s has wrong type. Allowed types = %s' % (col_name, str(allowed_types))))
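
The individual checks are thin wrappers over pandas dtype inspection; for example, the 'int' and 'str' cases reduce to:

import pandas as pd

col = pd.Series([1, 2, 3])
print(pd.api.types.is_integer_dtype(col))  # True
print(pd.api.types.is_string_dtype(col))   # False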

check_types_columns(df)

Checks if columns are in correct formats.

The formats are specified in requirements, which is a list of tuples. The first value is the name of the column and the second value is a list of allowed formats. The column must match at least one of the formats.

Parameters:

    df (DataFrame): A full dataframe of the data. Required.
Source code in wildlife_datasets/datasets/datasets.py
def check_types_columns(self, df: pd.DataFrame) -> None:
    """Checks if columns are in correct formats.

    The formats are specified in `requirements`, which is a list
    of tuples. The first value is the name of the column
    and the second value is a list of allowed formats. The column
    must match at least one of the formats.

    Args:
        df (pd.DataFrame): A full dataframe of the data.
    """

    requirements = [
        ('image_id', ['int', 'str']),
        ('identity', ['int', 'str']),
        ('path', ['str']),
        ('bbox', ['list_numeric']),
        ('date', ['date']),
        ('keypoints', ['list_numeric']),
        ('position', ['str']),
        ('species', ['str', 'list']),
        ('video', ['int']),
    ]
    # Verify if the columns are in correct formats
    for col_name, allowed_types in requirements:
        if col_name in df.columns:
            # Remove empty values to be sure
            col = df[col_name][~df[col_name].isnull()]
            if len(col) > 0:
                self.check_types_column(col, col_name, allowed_types)

check_unique_id(df)

Checks if values in the id column are unique.

Parameters:

    df (DataFrame): A full dataframe of the data. Required.
Source code in wildlife_datasets/datasets/datasets.py
def check_unique_id(self, df: pd.DataFrame) -> None:
    """Checks if values in the id column are unique.

    Args:
        df (pd.DataFrame): A full dataframe of the data.
    """

    if len(df['image_id'].unique()) != len(df):
        raise(Exception('Image ID not unique.'))

create_catalogue()

Creates the dataframe.

Raises:

    NotImplementedError: Needs to be implemented by subclasses.

Source code in wildlife_datasets/datasets/datasets.py
def create_catalogue(self):
    """Creates the dataframe.

    Raises:
        NotImplementedError: Needs to be implemented by subclasses.
    """

    raise NotImplementedError('Needs to be implemented by subclasses.')

display_name() classmethod

Returns name of the dataset without the v2 ending.

Returns:

    str: Name of the dataset.

Source code in wildlife_datasets/datasets/datasets.py
@classmethod
def display_name(cls) -> str:
    """Returns name of the dataset without the v2 ending.

    Returns:
        Name of the dataset.
    """

    cls_parent = cls.__bases__[0]
    while cls_parent != object and cls_parent.outdated_dataset:
        cls = cls_parent
        cls_parent = cls.__bases__[0]            
    return cls.__name__
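
A sketch of the version walk with hypothetical subclasses (only the inheritance chain and the outdated_dataset flag matter):

from wildlife_datasets import datasets

class Dataset(datasets.DatasetFactory):
    outdated_dataset = True   # replaced by Datasetv2

class Datasetv2(Dataset):
    outdated_dataset = False

print(Datasetv2.display_name())  # 'Dataset'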

download(root, force=False, **kwargs) classmethod

Downloads the data. Wrapper around cls._download.

Parameters:

    root (str): Where the data should be stored. Required.
    force (bool): If the root exists, whether it should be overwritten. Default: False.
Source code in wildlife_datasets/datasets/datasets.py
@classmethod
def download(
        cls,
        root: str,
        force: bool = False,
        **kwargs
        ) -> None:
    """Downloads the data. Wrapper around `cls._download`.

    Args:
        root (str): Where the data should be stored.
        force (bool, optional): If the root exists, whether it should be overwritten.
    """

    dataset_name = cls.__name__
    mark_file_name = os.path.join(root, cls.download_mark_name)

    already_downloaded = os.path.exists(mark_file_name)
    if cls.saved_to_system_folder:
        cls._download(**kwargs)
    elif already_downloaded and not force:
        print('DATASET %s: DOWNLOADING STARTED.' % dataset_name)            
        print(cls.download_warning)
    else:
        if os.path.exists(mark_file_name):
            os.remove(mark_file_name)
        with utils.data_directory(root):
            cls._download(**kwargs)
        open(mark_file_name, 'a').close()
        if hasattr(cls, 'summary') and 'licenses_url' in cls.summary:
            with open(os.path.join(root, cls.license_file_name), 'w') as file:
                file.write(cls.summary['licenses_url'])

extract(root, **kwargs) classmethod

Extract the data. Wrapper around cls._extract.

Parameters:

    root (str): Where the data should be stored. Required.
Source code in wildlife_datasets/datasets/datasets.py
@classmethod    
def extract(cls, root: str, **kwargs) -> None:
    """Extract the data. Wrapper around `cls._extract`.

    Args:
        root (str): Where the data should be stored.
    """

    if cls.saved_to_system_folder:
        cls._extract(**kwargs)
    else:
        with utils.data_directory(root):
            cls._extract(**kwargs)
        mark_file_name = os.path.join(root, cls.download_mark_name)
        open(mark_file_name, 'a').close()

finalize_catalogue(df)

Reorders the dataframe and checks file paths.

Reorders the columns and removes constant columns. Checks if columns are in correct formats. Checks if ids are unique and if all files exist.

Parameters:

    df (DataFrame): A full dataframe of the data. Required.

Returns:

    DataFrame: A full dataframe of the data, slightly modified.

Source code in wildlife_datasets/datasets/datasets.py
def finalize_catalogue(self, df: pd.DataFrame) -> pd.DataFrame:
    """Reorders the dataframe and check file paths.

    Reorders the columns and removes constant columns.
    Checks if columns are in correct formats.
    Checks if ids are unique and if all files exist.

    Args:
        df (pd.DataFrame): A full dataframe of the data.

    Returns:
        A full dataframe of the data, slightly modified.
    """

    if self.update_wrong_labels:
        df = self.fix_labels(df)
    self.check_required_columns(df)
    self.check_types_columns(df)
    df = self.reorder_df(df)
    df = self.remove_constant_columns(df)
    self.check_unique_id(df)
    self.check_files_exist(df['path'])
    self.check_files_names(df['path'])
    if 'segmentation' in df.columns:
        self.check_files_exist(df['segmentation'])
    return df
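
A typical create_catalogue implementation in a subclass builds the dataframe and passes it through finalize_catalogue. A hedged sketch (the class and paths are made up; finalize_catalogue verifies that the files in 'path' actually exist under root):

import pandas as pd
from wildlife_datasets import datasets

class ToyDataset(datasets.DatasetFactory):
    def create_catalogue(self) -> pd.DataFrame:
        df = pd.DataFrame({
            'image_id': [1, 2],
            'identity': ['a', 'b'],
            'path': ['images/1.jpg', 'images/2.jpg'],  # must exist under root
        })
        return self.finalize_catalogue(df)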

fix_labels(df)

Fixes labels in dataframe.

Automatically called in finalize_catalogue.

Source code in wildlife_datasets/datasets/datasets.py
def fix_labels(self, df: pd.DataFrame) -> pd.DataFrame:
    """Fixes labels in dataframe.

    Automatically called in `finalize_catalogue`.                
    """

    return df

fix_labels_remove_identity(df, identities_to_remove, col='identity')

Removes all instances of identities.

Parameters:

    df (DataFrame): A full dataframe of the data. Required.
    identities_to_remove (List): List of identities to remove. Required.
    col (str): Column to remove from. Default: 'identity'.

Returns:

    DataFrame: A full dataframe of the data.

Source code in wildlife_datasets/datasets/datasets.py
def fix_labels_remove_identity(
        self,
        df: pd.DataFrame,
        identities_to_remove: List,
        col: str = 'identity'
        ) -> pd.DataFrame:
    """Removes all instances of identities.

    Args:
        df (pd.DataFrame): A full dataframe of the data.
        identities_to_remove (List): List of identities to remove.
        col (str, optional): Column to remove from.

    Returns:
        A full dataframe of the data.
    """
    idx_remove = [identity in identities_to_remove for identity in df[col]]
    return df[~np.array(idx_remove)]
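
A small sketch on a toy dataframe; here dataset stands for any existing DatasetFactory instance (the method does not touch self):

import pandas as pd

df = pd.DataFrame({'identity': ['a', 'b', 'a'], 'path': ['1.jpg', '2.jpg', '3.jpg']})
df = dataset.fix_labels_remove_identity(df, ['a'])
print(df['identity'].tolist())  # ['b']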

fix_labels_replace_identity(df, replace_identity, col='identity')

Replaces all instances of identities.

Parameters:

    df (DataFrame): A full dataframe of the data. Required.
    replace_identity (List[Tuple]): List of (old_identity, new_identity) pairs. Required.
    col (str): Column to replace in. Default: 'identity'.

Returns:

    DataFrame: A full dataframe of the data.

Source code in wildlife_datasets/datasets/datasets.py
def fix_labels_replace_identity(
        self,
        df: pd.DataFrame,
        replace_identity: List[Tuple],
        col: str = 'identity'
        ) -> pd.DataFrame:
    """Replaces all instances of identities.

    Args:
        df (pd.DataFrame): A full dataframe of the data.
        replace_identity (List[Tuple]): List of (old_identity, new_identity)
        col (str, optional): Column to replace in.

    Returns:
        A full dataframe of the data.
    """
    for old_identity, new_identity in replace_identity:
        df[col] = df[col].replace({old_identity: new_identity})
    return df
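
Again a toy sketch, with dataset standing for any existing DatasetFactory instance:

import pandas as pd

df = pd.DataFrame({'identity': ['a', 'b', 'a'], 'path': ['1.jpg', '2.jpg', '3.jpg']})
df = dataset.fix_labels_replace_identity(df, [('a', 'c'), ('b', 'd')])
print(df['identity'].tolist())  # ['c', 'd', 'c']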

fix_labels_replace_images(df, replace_identity, col='identity')

Replaces specified images with specified identities.

It matches image_name as a substring of df['path'], which may cause problems with os.path.sep.

Parameters:

    df (DataFrame): A full dataframe of the data. Required.
    replace_identity (List[Tuple]): List of (image_name, old_identity, new_identity) triples. Required.
    col (str): Column to replace in. Default: 'identity'.

Returns:

    DataFrame: A full dataframe of the data.

Source code in wildlife_datasets/datasets/datasets.py
def fix_labels_replace_images(
        self,
        df: pd.DataFrame,
        replace_identity: List[Tuple],
        col: str = 'identity'
        ) -> pd.DataFrame:
    """Replaces specified images with specified identities.

    It matches `image_name` as a substring of df['path'],
    which may cause problems with `os.path.sep`.

    Args:
        df (pd.DataFrame): A full dataframe of the data.
        replace_identity (List[Tuple]): List of (image_name, old_identity, new_identity).
        col (str, optional): Column to replace in.

    Returns:
        A full dataframe of the data.
    """
    for image_name, old_identity, new_identity in replace_identity:
        n_replaced = 0
        for index, df_row in df.iterrows():
            # Check that there is an image with the required name and identity
            if image_name in df_row['path'] and old_identity == df_row[col]:
                df.loc[index, col] = new_identity
                n_replaced += 1
        if n_replaced == 0:
            print('File name %s with identity %s was not found.' % (image_name, str(old_identity)))
        elif n_replaced > 1:
            print('File name %s with identity %s was found multiple times.' % (image_name, str(old_identity)))
    return df

get_data(root, force=False, **kwargs) classmethod

Downloads and extracts the data. Wrapper around cls._download and cls._extract.

Parameters:

    root (str): Where the data should be stored. Required.
    force (bool): If the root exists, whether it should be overwritten. Default: False.
Source code in wildlife_datasets/datasets/datasets.py
@classmethod
def get_data(
        cls,
        root: str,
        force: bool = False,
        **kwargs
        ) -> None:
    """Downloads and extracts the data. Wrapper around `cls._download` and `cls._extract.`

    Args:
        root (str): Where the data should be stored.
        force (bool, optional): It the root exists, whether it should be overwritten.
    """

    dataset_name = cls.__name__
    mark_file_name = os.path.join(root, cls.download_mark_name)

    already_downloaded = os.path.exists(mark_file_name)
    if not cls.saved_to_system_folder and already_downloaded and not force:
        print('DATASET %s: DOWNLOADING STARTED.' % dataset_name)
        print(cls.download_warning)
    else:
        print('DATASET %s: DOWNLOADING STARTED.' % dataset_name)
        cls.download(root, force=force, **kwargs)
        print('DATASET %s: EXTRACTING STARTED.' % dataset_name)
        cls.extract(root,  **kwargs)
        print('DATASET %s: FINISHED.\n' % dataset_name)
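
Typical usage on a subclass (MacaqueFaces is used as an example; the root is created as needed):

from wildlife_datasets import datasets

datasets.MacaqueFaces.get_data('data/MacaqueFaces')
# A second call prints the already-downloaded warning unless forced:
datasets.MacaqueFaces.get_data('data/MacaqueFaces', force=True)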

get_image(idx)

Load an image with iloc idx.

Parameters:

    idx (int): Index of the image. Required.

Returns:

    Image: Loaded image.

Source code in wildlife_datasets/datasets/datasets.py
def get_image(self, idx: int) -> Image:
    """Load an image with iloc `idx`.

    Args:
        idx (int): Index of the image.

    Returns:
        Loaded image.
    """

    data = self.df.iloc[idx]
    if self.root:
        img_path = os.path.join(self.root, data['path'])
    else:
        img_path = data['path']
    img = self.load_image(img_path)
    return img

load_image(path)

Load an image with path.

Parameters:

    path (str): Path to the image. Required.

Returns:

    Image: Loaded image.

Source code in wildlife_datasets/datasets/datasets.py
def load_image(self, path: str) -> Image:
    """Load an image with `path`.

    Args:
        path (str): Path to the image.

    Returns:
        Loaded image.
    """

    return utils.load_image(path)

plot_grid(n_rows=5, n_cols=8, offset=10, img_min=100, rotate=True, header_cols=None, idx=None, background_color=(0, 0, 0), **kwargs)

Plots a grid of size (n_rows, n_cols) with images from the dataframe.

Parameters:

- n_rows (int, default 5): The number of rows in the grid.
- n_cols (int, default 8): The number of columns in the grid.
- offset (float, default 10): The offset between images.
- img_min (float, default 100): The minimal size of the plotted images.
- rotate (bool, default True): Rotates the images to have the same orientation.
- header_cols (Optional[List[str]], default None): List of headers for each column.
- idx (Optional[Union[List[bool], List[int]]], default None): List of indices to plot. None plots random images. Index -1 plots an empty image.
- background_color (Tuple[int], default (0, 0, 0)): Background color of the grid.
Source code in wildlife_datasets/datasets/datasets.py
def plot_grid(
        self,
        n_rows: int = 5,
        n_cols: int = 8,
        offset: float = 10,
        img_min: float = 100,
        rotate: bool = True,
        header_cols: Optional[List[str]] = None,
        idx: Optional[Union[List[bool],List[int]]] = None,
        background_color: Tuple[int] = (0, 0, 0),
        **kwargs
        ) -> None:
    """Plots a grid of size (n_rows, n_cols) with images from the dataframe.

    Args:
        n_rows (int, optional): The number of rows in the grid.
        n_cols (int, optional): The number of columns in the grid.
        offset (float, optional): The offset between images.
        img_min (float, optional): The minimal size of the plotted images.
        rotate (bool, optional): Rotates the images to have the same orientation.
        header_cols (Optional[List[str]], optional): List of headers for each column.
        idx (Optional[Union[List[bool],List[int]]], optional): List of indices to plot. None plots random images. Index -1 plots an empty image.
        background_color (Tuple[int], optional): Background color of the grid.
    """

    if len(self.df) == 0:
        return None

    # Select indices of images to be plotted
    if idx is None:
        n = min(len(self.df), n_rows*n_cols)
        idx = np.random.permutation(len(self.df))[:n]
    else:
        if isinstance(idx, pd.Series):
            idx = idx.values
        if isinstance(idx[0], (bool, np.bool_)):
            idx = np.where(idx)[0]
        n = min(np.array(idx).size, n_rows*n_cols)
        idx = np.array(idx).flatten()[:n]

    # Load images and compute their ratio
    ratios = []
    ims = []
    for k in idx:
        if k >= 0:
            # Load the image with index k
            im = self[k]
            ims.append(im)
            ratios.append(im.size[0] / im.size[1])
        else:
            # Load a black image
            ims.append(Image.fromarray(np.zeros((2, 2), dtype = "uint8")))

    # Safeguard when all indices are -1
    if len(ratios) == 0:
        return None

    # Get the size of the images after being resized
    ratio = np.median(ratios)
    if ratio > 1:    
        img_w, img_h = int(img_min*ratio), int(img_min)
    else:
        img_w, img_h = int(img_min), int(img_min/ratio)

    # Compute height offset if headers are present
    if header_cols is not None:
        offset_h = 30
        if len(header_cols) != n_cols:
            raise Exception("Length of header_cols must be the same as n_cols.")
    else:
        offset_h = 0

    # Create an empty image grid
    im_grid = Image.new('RGB', (n_cols*img_w + (n_cols-1)*offset, offset_h + n_rows*img_h + (n_rows-1)*offset), background_color)

    # Fill the grid image by image
    pos_y = offset_h
    for i in range(n_rows):
        row_h = 0
        for j in range(n_cols):
            k = (n_cols)*i + j
            if k < n:
                # Possibly rotate the image
                im = ims[k]
                if rotate and ((ratio > 1 and im.size[0] < im.size[1]) or (ratio < 1 and im.size[0] > im.size[1])):
                    im = im.transpose(Image.Transpose.ROTATE_90)

                # Rescale the image
                im.thumbnail((img_w,img_h))
                row_h = max(row_h, im.size[1])

                # Place the image on the grid
                pos_x = j*img_w + j*offset
                im_grid.paste(im, (pos_x,pos_y))
        if row_h > 0:
            pos_y += row_h + offset
    im_grid = im_grid.crop((0, 0, im_grid.size[0], pos_y-offset))

    # Plot the image and add column headers if present
    fig = plt.figure()
    fig.patch.set_visible(False)
    ax = fig.add_subplot(111)
    plt.axis('off')
    plt.imshow(im_grid)
    if header_cols is not None:
        color = kwargs.pop('color', 'white')
        ha = kwargs.pop('ha', 'center')
        va = kwargs.pop('va', 'center')
        for i, header in enumerate(header_cols):
            pos_x = (i+0.5)*img_w + i*offset
            pos_y = offset_h/2
            plt.text(pos_x, pos_y, str(header), color=color, ha=ha, va=va, **kwargs)
    return fig
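
For example, a minimal usage sketch (reusing the dataset object from the get_image example):

# A random 2x4 grid with one header per column.
dataset.plot_grid(n_rows=2, n_cols=4, header_cols=['A', 'B', 'C', 'D'])

# An explicit selection; index -1 leaves its cell empty.
dataset.plot_grid(n_rows=1, n_cols=4, idx=[0, 1, 2, -1])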

remove_constant_columns(df)

Removes columns with a single unique value.

Parameters:

- df (DataFrame, required): A full dataframe of the data.

Returns:

- DataFrame: A full dataframe of the data, slightly modified.

Source code in wildlife_datasets/datasets/datasets.py
def remove_constant_columns(self, df: pd.DataFrame) -> pd.DataFrame:
    """Removes columns with a single unique value.

    Args:
        df (pd.DataFrame): A full dataframe of the data.

    Returns:
        A full dataframe of the data, slightly modified.
    """ 

    for df_name in list(df.columns):
        if df[df_name].astype('str').nunique() == 1:
            df = df.drop([df_name], axis=1)
    return df
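
For example, a minimal sketch with a toy dataframe (reusing the dataset object from the get_image example):

import pandas as pd

df = pd.DataFrame({
    'image_id': [1, 2, 3],
    'identity': ['a', 'b', 'a'],
    'species': ['zebra', 'zebra', 'zebra'],  # a single unique value
})
df = dataset.remove_constant_columns(df)
# the constant 'species' column is dropped; 'image_id' and 'identity' remain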

reorder_df(df)

Reorders rows and columns in the dataframe.

Rows are sorted based on id. Columns are reordered based on the default_order list.

Parameters:

- df (DataFrame, required): A full dataframe of the data.

Returns:

- DataFrame: A full dataframe of the data, slightly modified.

Source code in wildlife_datasets/datasets/datasets.py
def reorder_df(self, df: pd.DataFrame) -> pd.DataFrame:
    """Reorders rows and columns in the dataframe.

    Rows are sorted based on id.
    Columns are reordered based on the `default_order` list.

    Args:
        df (pd.DataFrame): A full dataframe of the data.

    Returns:
        A full dataframe of the data, slightly modified.
    """

    default_order = ['image_id', 'identity', 'path', 'bbox', 'date', 'keypoints', 'orientation', 'segmentation', 'species']
    df_names = list(df.columns)
    col_names = []
    for name in default_order:
        if name in df_names:
            col_names.append(name)
    for name in df_names:
        if name not in default_order:
            col_names.append(name)

    df = df.sort_values('image_id').reset_index(drop=True)
    return df.reindex(columns=col_names)
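
For example, a minimal sketch with a toy dataframe (reusing the dataset object from the get_image example):

import pandas as pd

df = pd.DataFrame({'date': ['2021-01-01'], 'identity': ['a'], 'image_id': [1]})
df = dataset.reorder_df(df)
# the columns now follow the default order: image_id, identity, date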

Metadata

Class for storing metadata.

Attributes:

- df (DataFrame): A dataframe of the metadata.

Source code in wildlife_datasets/datasets/summary.py
class Summary():
    """Class for storing metadata.

    Attributes:
      df (pd.DataFrame): A dataframe of the metadata.
    """

    def __init__(self, path: str):
        """Loads the metadata from a csv file into a dataframe.

        The `animals` column is evaluated from its string representation.

        Args:
            path (str): Path of the csv file.
        """

        df = pd.read_csv(path, index_col='name')
        if 'animals' in df.columns:
            df.loc[df['animals'].isnull(), 'animals'] = '{}'
            df['animals'] = df['animals'].apply(lambda x: eval(x))
        self.df = df

    def __getitem__(self, item):
        return self.df.loc[item].dropna().to_dict()
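
For example, a minimal usage sketch (summary.csv is a hypothetical metadata file indexed by a 'name' column; the import path follows the module location shown above):

from wildlife_datasets.datasets.summary import Summary

summary = Summary('summary.csv')
record = summary['MacaqueFaces']  # the metadata of one dataset as a dict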

__init__(path)

Loads the metadata from a csv file into a dataframe.

The animals column is evaluated from its string representation.

Parameters:

- path (str, required): Path of the csv file.
Source code in wildlife_datasets/datasets/summary.py
def __init__(self, path: str):
    """Loads the metadata from a csv file into a dataframe.

    The `animals` column is evaluated from its string representation.

    Args:
        path (str): Path of the csv file.
    """

    df = pd.read_csv(path, index_col='name')
    if 'animals' in df.columns:
        df.loc[df['animals'].isnull(), 'animals'] = '{}'
        df['animals'] = df['animals'].apply(lambda x: eval(x))
    self.df = df

Utils

bbox_segmentation(bbox)

Convert bounding box to segmentation.

Parameters:

- bbox (List[float], required): Bounding box in the form [x, y, w, h].

Returns:

- List[float]: Segmentation mask in the form [x1, y1, x2, y2, ...].

Source code in wildlife_datasets/datasets/utils.py
def bbox_segmentation(bbox: List[float]) -> List[float]:
    """Convert bounding box to segmentation.

    Args:
        bbox (List[float]): Bounding box in the form [x, y, w, h].

    Returns:
        Segmentation mask in the form [x1, y1, x2, y2, ...].
    """

    return [bbox[0], bbox[1], bbox[0]+bbox[2], bbox[1], bbox[0]+bbox[2], bbox[1]+bbox[3], bbox[0], bbox[1]+bbox[3], bbox[0], bbox[1]]
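
For example, a worked sketch (assuming the module is importable as shown):

from wildlife_datasets.datasets import utils

utils.bbox_segmentation([10, 20, 30, 40])
# [10, 20, 40, 20, 40, 60, 10, 60, 10, 20]
# the four corners in order, with the first corner repeated to close the polygon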

create_id(string_col)

Creates unique ids from strings based on an MD5 hash.

Parameters:

- string_col (Series, required): Series of string ids.

Returns:

- Series: Series of encoded ids.

Source code in wildlife_datasets/datasets/utils.py
def create_id(string_col: pd.Series) -> pd.Series:
    """Creates unique ids from string based on MD5 hash.

    Args:
        string_col (pd.Series): List of ids.

    Returns:
        List of encoded ids.
    """

    entity_id = string_col.apply(lambda x: hashlib.md5(x.encode()).hexdigest()[:16])
    assert len(entity_id.unique()) == len(entity_id)
    return entity_id
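
For example, a minimal sketch (assuming the module is importable as shown):

import pandas as pd
from wildlife_datasets.datasets import utils

paths = pd.Series(['images/0001.jpg', 'images/0002.jpg'])
ids = utils.create_id(paths)  # two distinct 16-character hex ids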

crop_black(img)

Crops black borders from an image.

Parameters:

- img (Image, required): Image to be cropped.

Returns:

- Image: Cropped image.

Source code in wildlife_datasets/datasets/utils.py
def crop_black(img: Image) -> Image:
    """Crops black borders from an image.    

    Args:
        img (Image): Image to be cropped.

    Returns:
        Cropped image.
    """

    y_nonzero, x_nonzero, _ = np.nonzero(img)
    return img.crop(
        (
            np.min(x_nonzero),
            np.min(y_nonzero),
            np.max(x_nonzero),
            np.max(y_nonzero),
        )
    )
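
For example, a minimal sketch with a synthetic image (assuming the module is importable as shown):

import numpy as np
from PIL import Image
from wildlife_datasets.datasets import utils

# A white square surrounded by a black border.
arr = np.zeros((100, 100, 3), dtype='uint8')
arr[25:75, 25:75] = 255
img = utils.crop_black(Image.fromarray(arr))
print(img.size)  # (49, 49): only the nonzero region is kept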

data_directory(dir)

Changes the context so that the data directory is used as the current working directory. The data directory is created if it does not exist.

Source code in wildlife_datasets/datasets/utils.py
@contextmanager
def data_directory(dir):
    '''
    Changes the context so that the data directory is used as the current
    working directory. The data directory is created if it does not exist.
    '''
    current_dir = os.getcwd()
    if not os.path.exists(dir):
        os.makedirs(dir)
    os.chdir(dir)
    try:
        yield
    finally:
        os.chdir(current_dir)
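
For example, a minimal sketch (assuming the module is importable as shown):

from wildlife_datasets.datasets import utils

# Files created inside the block land in 'data'; the previous working
# directory is restored afterwards, even if an exception is raised.
with utils.data_directory('data'):
    open('downloaded.txt', 'w').close()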

find_images(root, img_extensions=('.png', '.jpg', '.jpeg'))

Finds all image files in a folder and its subfolders.

Parameters:

- root (str, required): The root folder where to look for images.
- img_extensions (Tuple[str, ...], default ('.png', '.jpg', '.jpeg')): Image extensions to look for.

Returns:

- DataFrame: Dataframe of relative paths of the images.

Source code in wildlife_datasets/datasets/utils.py
def find_images(
        root: str,
        img_extensions: Tuple[str, ...] = ('.png', '.jpg', '.jpeg')
        ) -> pd.DataFrame:
    """Finds all image files in folder and subfolders.

    Args:
        root (str): The root folder where to look for images.
        img_extensions (Tuple[str, ...], optional): Image extensions to look for, by default ('.png', '.jpg', '.jpeg').

    Returns:
        Dataframe of relative paths of the images.
    """

    data = [] 
    for path, directories, files in os.walk(root):
        for file in files:
            if file.lower().endswith(tuple(img_extensions)):
                data.append({'path': os.path.relpath(path, start=root), 'file': file})
    return pd.DataFrame(data)
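
For example, a minimal sketch (it assumes images were downloaded under data/MacaqueFaces):

import os
from wildlife_datasets.datasets import utils

df = utils.find_images('data/MacaqueFaces')
# each row holds the folder (relative to the root) and the file name
df['full_path'] = df.apply(lambda r: os.path.join(r['path'], r['file']), axis=1)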

is_annotation_bbox(segmentation, bbox, tol=0)

Checks whether the segmentation is a bounding box.

Parameters:

- segmentation (List[float], required): Segmentation mask in the form [x1, y1, x2, y2, ...].
- bbox (List[float], required): Bounding box in the form [x, y, w, h].
- tol (float, default 0): Tolerance for the difference.

Returns:

- bool: True if the segmentation is the bounding box within the tolerance.

Source code in wildlife_datasets/datasets/utils.py
def is_annotation_bbox(
        segmentation: List[float],
        bbox: List[float],
        tol: float = 0
        ) -> bool:
    """Checks whether segmentation is bounding box.

    Args:
        segmentation (List[float]): Segmentation mask in the form [x1, y1, x2, y2, ...].
        bbox (List[float]): Bounding box in the form [x, y, w, h].
        tol (float, optional): Tolerance for difference.

    Returns:
        True if segmentation is bounding box within tolerance.
    """

    bbox_seg = bbox_segmentation(bbox)
    if len(segmentation) == len(bbox_seg):
        for x, y in zip(segmentation, bbox_seg):
            if abs(x-y) > tol:
                return False
    else:
        return False
    return True
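
For example, a worked sketch together with bbox_segmentation (assuming the module is importable as shown):

from wildlife_datasets.datasets import utils

bbox = [10, 20, 30, 40]
utils.is_annotation_bbox(utils.bbox_segmentation(bbox), bbox)   # True
utils.is_annotation_bbox([10, 20, 40, 20, 40, 60], bbox)        # False: lengths differ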

load_image(path, max_size=None)

Loads an image.

Parameters:

- path (str, required): Path of the image.
- max_size (int, default None): Maximal size of the longer image side, or None (no restriction).

Returns:

- Image: Loaded image.

Source code in wildlife_datasets/datasets/utils.py
def load_image(path: str, max_size: int = None) -> Image:
    """Loads an image.

    Args:
        path (str): Path of the image.
        max_size (int, optional): Maximal size of the longer image side, or None (no restriction).

    Returns:
        Loaded image.
    """

    # We load it with OpenCV because PIL does not apply EXIF orientation metadata.
    img = cv2.imread(path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = Image.fromarray(img)
    if max_size is not None:
        w, h = img.size
        if max(w, h) > max_size:
            c = max_size / max(w, h)
            img = img.resize((int(c*w), int(c*h)))
    return img
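
For example, a minimal sketch (example.jpg is a hypothetical file; the module import is assumed):

from wildlife_datasets.datasets import utils

img = utils.load_image('example.jpg', max_size=300)
# the longer side is now at most 300 pixels; the aspect ratio is preserved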

segmentation_bbox(segmentation)

Convert segmentation to bounding box.

Parameters:

- segmentation (List[float], required): Segmentation mask in the form [x1, y1, x2, y2, ...].

Returns:

- List[float]: Bounding box in the form [x, y, w, h].

Source code in wildlife_datasets/datasets/utils.py
def segmentation_bbox(segmentation: List[float]) -> List[float]:
    """Convert segmentation to bounding box.

    Args:
        segmentation (List[float]): Segmentation mask in the form [x1, y1, x2, y2, ...].

    Returns:
        Bounding box in the form [x, y, w, h].
    """

    x = segmentation[0::2]
    y = segmentation[1::2]
    x_min = np.min(x)
    x_max = np.max(x)
    y_min = np.min(y)
    y_max = np.max(y)
    return [x_min, y_min, x_max-x_min, y_max-y_min]
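
For example, a worked sketch (the inverse of bbox_segmentation for axis-aligned rectangles; the module import is assumed):

from wildlife_datasets.datasets import utils

utils.segmentation_bbox([10, 20, 40, 20, 40, 60, 10, 60, 10, 20])
# [10, 20, 30, 40]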