
Reference utils

This page describes the functions used for dataset analysis and loading.

Analysis

compute_span(df)

Compute the time span of the dataset.

The span is defined as the difference between the latest and the earliest image timestamps. Spans are computed separately for each individual and the maximum over individuals is returned.

Parameters:

Name  Type       Description                    Default
df    DataFrame  A full dataframe of the data.  required

Returns:

Type   Description
float  The span of the dataset in seconds.

Source code in wildlife_datasets/analysis/statistics.py
def compute_span(df: pd.DataFrame) -> float:
    """Compute the time span of the dataset.

    The span is defined as the difference between the latest and the earliest
    image timestamps. Spans are computed separately for each individual
    and the maximum over individuals is returned.

    Args:
        df (pd.DataFrame): A full dataframe of the data.

    Returns:
        The span of the dataset in seconds.
    """

    # Remove missing dates and convert the remainder to datetime
    df = df.loc[~df['date'].isnull()]
    dates = pd.to_datetime(df['date']).to_numpy()

    # Find the maximal span across individuals
    identities = df['identity'].unique()
    span_seconds = -np.inf
    for identity in identities:
        idx = df['identity'] == identity
        span_seconds = np.maximum(span_seconds, (max(dates[idx]) - min(dates[idx])) / np.timedelta64(1, 's'))
    return span_seconds
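
A minimal usage sketch: the import path is inferred from the source location above, and the toy dataframe is purely illustrative; it only needs the 'identity' and 'date' columns that compute_span reads.

import pandas as pd

from wildlife_datasets.analysis.statistics import compute_span  # path inferred from above

# Toy dataframe: individual 'b' spans nine days, individual 'a' only two
df = pd.DataFrame({
    'identity': ['a', 'a', 'b', 'b'],
    'date': ['2020-01-01', '2020-01-03', '2020-01-01', '2020-01-10'],
})
print(compute_span(df) / (60 * 60 * 24))  # 9.0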

display_statistics(df, unknown_name='')

Prints statistics about the dataframe.

Parameters:

Name          Type       Description                    Default
df            DataFrame  A full dataframe of the data.  required
unknown_name  str        Name of the unknown class.     ''
Source code in wildlife_datasets/analysis/statistics.py
def display_statistics(df: pd.DataFrame, unknown_name: str = '') -> None:
    """Prints statistics about the dataframe.

    Args:
        df (pd.DataFrame): A full dataframe of the data.
        unknown_name (str, optional): Name of the unknown class.
    """

    # Remove the unknown identities
    df_red = df.loc[df['identity'] != unknown_name, 'identity']
    df_red.value_counts().reset_index(drop=True).plot(xlabel='identities', ylabel='counts')

    # Compute the total number of identities
    if unknown_name in list(df['identity'].unique()):
        n_identity = len(df['identity'].unique()) - 1
    else:
        n_identity = len(df['identity'].unique())
    n_one = len(df.groupby('identity').filter(lambda x: len(x) == 1))
    n_unidentified = sum(df['identity'] == unknown_name)

    # Print general statistics
    print(f"Number of identitites            {n_identity}")
    print(f"Number of all animals            {len(df)}")
    print(f"Number of animals with one image {n_one}")
    print(f"Number of unidentified animals   {n_unidentified}")

    # Print statistics about video if present
    if 'video' in df.columns:
        print(f"Number of videos                 {len(df[['identity', 'video']].drop_duplicates())}")

    # Print statistics about time span if present
    if 'date' in df.columns:
        span_years = compute_span(df) / (60*60*24*365.25)
        if span_years > 1:
            print("Images span                      %1.1f years" % span_years)
        elif span_years * 12 > 1:
            print("Images span                      %1.1f months" % (span_years * 12))
        else:
            print("Images span                      %1.0f days" % (span_years * 365.25))

Loading

get_dataframe_path(root_dataframe, class_dataset)

Creates the path to the pickled dataframe.

Parameters:

Name            Type  Description                            Default
root_dataframe  str   Path where all dataframes are stored.  required
class_dataset   type  Type of DatasetFactory.                required

Returns:

Type  Description
str   Path to the dataframe.

Source code in wildlife_datasets/loader/loader.py
def get_dataframe_path(root_dataframe: str, class_dataset: type) -> str:
    """Creates path to the pickled dataframe.

    Args:
        root_dataframe (str): Path where all dataframes are stored.
        class_dataset (type): Type of DatasetFactory.

    Returns:
        Path to the dataframe.
    """

    return os.path.join(root_dataframe, class_dataset.__name__ + '.pkl')
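
For instance, assuming the function is re-exported by the loader module and using MacaqueFaces as an illustrative DatasetFactory subclass:

from wildlife_datasets import datasets, loader  # assumed top-level modules

path = loader.get_dataframe_path('dataframes', datasets.MacaqueFaces)
print(path)  # dataframes/MacaqueFaces.pkl on POSIX systems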

get_dataset_folder(root_dataset, class_dataset)

Creates the path to the dataset data.

Parameters:

Name           Type  Description                          Default
root_dataset   str   Path where all datasets are stored.  required
class_dataset  type  Type of DatasetFactory.              required

Returns:

Type  Description
str   Path to the stored data.

Source code in wildlife_datasets/loader/loader.py
def get_dataset_folder(root_dataset: str, class_dataset: type) -> str:
    """Creates path to the dataset data.

    Args:
        root_dataset (str): Path where all datasets are stored.
        class_dataset (type): Type of DatasetFactory.

    Returns:
        Path to the stored data.
    """

    return os.path.join(root_dataset, class_dataset.display_name())
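
A matching sketch under the same assumptions; the folder name comes from display_name(), which typically, but not necessarily, matches the class name:

from wildlife_datasets import datasets, loader  # assumed top-level modules

folder = loader.get_dataset_folder('data', datasets.MacaqueFaces)
print(folder)  # typically data/MacaqueFaces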

load_dataset(class_dataset, root_dataset, root_dataframe, overwrite=False, **kwargs)

Loads a dataset from a pickled dataframe or creates it.

If the dataframe is already saved in a pkl file, it loads it. Otherwise, it creates the dataframe and saves it in a pkl file.

Parameters:

Name            Type  Description                                            Default
class_dataset   type  Type of DatasetFactory to load.                        required
root_dataset    str   Path where all datasets are stored.                    required
root_dataframe  str   Path where all dataframes are stored.                  required
overwrite       bool  Whether the pickled dataframe should be overwritten.   False

Returns:

Type            Description
DatasetFactory  The loaded dataset.

Source code in wildlife_datasets/loader/loader.py
def load_dataset(
        class_dataset: type,
        root_dataset: str,
        root_dataframe: str,
        overwrite: bool = False,
        **kwargs
        ) -> DatasetFactory:
    """Loads dataset from a pickled dataframe or creates it.

    If the dataframe is already saved in a pkl file, it loads it.
    Otherwise, it creates the dataframe and saves it in a pkl file.

    Args:
        class_dataset (type): Type of DatasetFactory to load.
        root_dataset (str): Path where all datasets are stored.
        root_dataframe (str): Path where all dataframes are stored.
        overwrite (bool, optional): Whether the pickled dataframe should be overwritten.

    Returns:
        The loaded dataset.
    """

    # Check if the dataset is downloaded
    if not os.path.exists(root_dataset):
        raise Exception('Data not found. Download it first.')

    # Get paths of the dataset and the pickled dataframe
    root = get_dataset_folder(root_dataset, class_dataset)
    df_path = get_dataframe_path(root_dataframe, class_dataset)
    if not class_dataset.determined_by_df:
        # Create the dataframe, no point in saving as it is not determined by it
        dataset = class_dataset(root, None, **kwargs)
    elif overwrite or not os.path.exists(df_path):
        # Create the dataframe, save it and create the dataset
        dataset = class_dataset(root, None, **kwargs)
        if not os.path.exists(root_dataframe):
            os.makedirs(root_dataframe)
        dataset.df.to_pickle(df_path)
    else:
        # Load the dataframe and create the dataset
        df = pd.read_pickle(df_path)
        dataset = class_dataset(root, df, **kwargs)
    return dataset
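
A usage sketch, assuming the data have already been downloaded to 'data' and that MacaqueFaces is an available DatasetFactory subclass:

from wildlife_datasets import datasets, loader  # assumed top-level modules

# The first call builds the dataframe and pickles it to dataframes/MacaqueFaces.pkl;
# later calls load the pickle instead of re-parsing the dataset files
d = loader.load_dataset(datasets.MacaqueFaces, 'data', 'dataframes')

# Force the dataframe to be rebuilt and re-pickled
d = loader.load_dataset(datasets.MacaqueFaces, 'data', 'dataframes', overwrite=True)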

load_datasets(class_datasets, root_dataset, root_dataframe, **kwargs)

Loads multiple datasets as described in load_dataset.

Parameters:

Name            Type        Description                                Default
class_datasets  List[type]  List of types of DatasetFactory to load.  required
root_dataset    str         Path where all datasets are stored.       required
root_dataframe  str         Path where all dataframes are stored.     required

Returns:

Type                  Description
List[DatasetFactory]  The list of loaded datasets.

Source code in wildlife_datasets/loader/loader.py
def load_datasets(
        class_datasets: List[type],
        root_dataset: str,
        root_dataframe: str,
        **kwargs
        ) -> List[DatasetFactory]:
    """Loads multiple datasets as described in `load_dataset`.

    Args:
        class_datasets (List[type]): List of types of DatasetFactory to load.
        root_dataset (str): Path where all datasets are stored.
        root_dataframe (str): Path where all dataframes are stored.

    Returns:
        The list of loaded datasets.
    """

    return [load_dataset(class_dataset, root_dataset, root_dataframe, **kwargs) for class_dataset in class_datasets]
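
The same pattern extends to several datasets at once; the dataset classes below are an illustrative choice:

from wildlife_datasets import datasets, loader  # assumed top-level modules

ds = loader.load_datasets(
    [datasets.MacaqueFaces, datasets.IPanda50],
    'data',
    'dataframes',
)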