Skip to content

Reference utils

This file describes methods associated with dataset analysis and loading.

Analysis

compute_span(df, col_label='identity')

Compute the time span of the dataset.

The span is defined as the latest time minus the earliest time an image was taken. The times are computed separately for each individual.

Parameters:

Name Type Description Default
df DataFrame

A full dataframe of the data.

required
col_label str

Column name containing individual animal names (labels).

'identity'

Returns:

Type Description
float

The span of the dataset in seconds.

Source code in wildlife_datasets/analysis/statistics.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def compute_span(df: pd.DataFrame, col_label: str = 'identity') -> float:
    """Compute the time span of the dataset.

    The span is the latest minus the earliest time an image was taken.
    Spans are computed separately for each individual and the largest
    one is returned.

    Args:
        df (pd.DataFrame): A full dataframe of the data.
        col_label (str, optional): Column name containing individual animal names (labels).

    Returns:
        The span of the dataset in seconds.
    """

    # Ignore rows without a date and convert the rest to numpy datetimes
    df = df.loc[~df['date'].isnull()]
    dates = pd.to_datetime(df['date']).to_numpy()

    # Track the largest per-individual span
    one_second = np.timedelta64(1, 's')
    best_span = -np.inf
    for name in df[col_label].unique():
        mask = (df[col_label] == name).to_numpy()
        dates_name = dates[mask]
        span = (dates_name.max() - dates_name.min()) / one_second
        best_span = max(best_span, span)
    return best_span

display_statistics(df, unknown_name='', col_label='identity')

Prints statistics about the dataframe.

Parameters:

Name Type Description Default
df DataFrame

A full dataframe of the data.

required
unknown_name str

Name of the unknown class.

''
col_label str

Column name containing individual animal names (labels).

'identity'
Source code in wildlife_datasets/analysis/statistics.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def display_statistics(
        df: pd.DataFrame,
        unknown_name: str = '',
        col_label: str = 'identity',
        ) -> None:
    """Prints statistics about the dataframe.

    Plots the counts per identity (unknown class excluded) and prints
    counts of identities, images and, when the columns are present,
    video counts and the overall time span.

    Args:
        df (pd.DataFrame): A full dataframe of the data.
        unknown_name (str, optional): Name of the unknown class.
        col_label (str, optional): Column name containing individual animal names (labels).
    """

    # Remove the unknown identities and plot counts per identity
    df_red = df.loc[df[col_label] != unknown_name, col_label]
    df_red.value_counts().reset_index(drop=True).plot(xlabel='identities', ylabel='counts')

    # Compute the total number of identities (the unknown class does not count).
    # Use col_label instead of the hard-coded 'identity' column.
    n_identity = df[col_label].nunique()
    if unknown_name in list(df[col_label].unique()):
        n_identity -= 1
    n_one = len(df.groupby(col_label).filter(lambda x: len(x) == 1))
    n_unidentified = sum(df[col_label] == unknown_name)

    # Print general statistics (fixed typo: identitites -> identities)
    print(f"Number of identities             {n_identity}")
    print(f"Number of all animals            {len(df)}")
    print(f"Number of animals with one image {n_one}")
    print(f"Number of unidentified animals   {n_unidentified}")

    # Print statistics about video if present
    if 'video' in df.columns:
        print(f"Number of videos                 {len(df[[col_label, 'video']].drop_duplicates())}")

    # Print statistics about time span if present
    if 'date' in df.columns:
        span_years = compute_span(df, col_label=col_label) / (60*60*24*365.25)
        if span_years > 1:
            print(f"Images span                      {span_years:.1f} years")
        elif span_years * 12 > 1:
            # More than one month: the original checked span_years / 12 > 1,
            # which is unreachable after the first branch (it means > 12 years).
            print(f"Images span                      {span_years * 12:.1f} months")
        else:
            print(f"Images span                      {span_years * 365.25:.0f} days")

Loading

get_dataframe_path(root_dataframe, class_dataset)

Creates path to the pickled dataframe.

Parameters:

Name Type Description Default
root_dataframe str

Path where all dataframes are stored.

required
class_dataset type

Type of WildlifeDataset.

required

Returns:

Type Description
str

Path to the dataframe.

Source code in wildlife_datasets/loader/loader.py
20
21
22
23
24
25
26
27
28
29
30
31
def get_dataframe_path(root_dataframe: str, class_dataset: type) -> str:
    """Creates path to the pickled dataframe.

    The file is named after the dataset class with a ``.pkl`` suffix.

    Args:
        root_dataframe (str): Path where all dataframes are stored.
        class_dataset (type): Type of WildlifeDataset.

    Returns:
        Path to the dataframe.
    """

    file_name = f'{class_dataset.__name__}.pkl'
    return os.path.join(root_dataframe, file_name)

get_dataset_folder(root_dataset, class_dataset)

Creates path to the dataset data.

Parameters:

Name Type Description Default
root_dataset str

Path where all datasets are stored.

required
class_dataset type

Type of WildlifeDataset.

required

Returns:

Type Description
str

Path to the stored data.

Source code in wildlife_datasets/loader/loader.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
def get_dataset_folder(root_dataset: str, class_dataset: type) -> str:
    """Creates path to the dataset data.

    The folder is named by the dataset class via ``display_name()``.

    Args:
        root_dataset (str): Path where all datasets are stored.
        class_dataset (type): Type of WildlifeDataset.

    Returns:
        Path to the stored data.
    """

    folder_name = class_dataset.display_name()
    return os.path.join(root_dataset, folder_name)

load_dataset(class_dataset, root_dataset, root_dataframe, overwrite=False, **kwargs)

Loads dataset from a pickled dataframe or creates it.

If the dataframe is already saved in a pkl file, it loads it. Otherwise, it creates the dataframe and saves it in a pkl file.

Parameters:

Name Type Description Default
class_dataset type

Type of WildlifeDataset to load.

required
root_dataset str

Path where all datasets are stored.

required
root_dataframe str

Path where all dataframes are stored.

required
overwrite bool

Whether the pickled dataframe should be overwritten.

False

Returns:

Type Description
WildlifeDataset

The loaded dataset.

Source code in wildlife_datasets/loader/loader.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def load_dataset(
        class_dataset: type,
        root_dataset: str,
        root_dataframe: str,
        overwrite: bool = False,
        **kwargs
        ) -> WildlifeDataset:
    """Loads dataset from a pickled dataframe or creates it.

    If the dataframe is already saved in a pkl file, it loads it.
    Otherwise, it creates the dataframe and saves it in a pkl file.

    Args:
        class_dataset (type): Type of WildlifeDataset to load.
        root_dataset (str): Path where all datasets are stored.
        root_dataframe (str): Path where all dataframes are stored.
        overwrite (bool, optional): Whether the pickled dataframe should be overwritten.

    Returns:
        The loaded dataset.

    Raises:
        Exception: If the dataset data are not downloaded yet.
    """

    # Check if the dataset is downloaded.
    if not os.path.exists(root_dataset):
        raise Exception('Data not found. Download them first.')

    # Get paths of the dataset and the pickled dataframe
    root = get_dataset_folder(root_dataset, class_dataset)
    df_path = get_dataframe_path(root_dataframe, class_dataset)
    if not class_dataset.determined_by_df:
        # The dataset is not determined by its dataframe, so caching
        # the dataframe would have no effect -- create it directly.
        dataset = class_dataset(root, None, **kwargs)
    elif overwrite or not os.path.exists(df_path):
        # Create the dataframe, save it and create the dataset.
        dataset = class_dataset(root, None, **kwargs)
        # exist_ok avoids the race between an existence check and creation
        os.makedirs(root_dataframe, exist_ok=True)
        dataset.df.to_pickle(df_path)
    else:
        # Load the cached dataframe and create the dataset from it
        df = pd.read_pickle(df_path)
        dataset = class_dataset(root, df, **kwargs)
    return dataset

load_datasets(class_datasets, root_dataset, root_dataframe, **kwargs)

Loads multiple datasets as described in load_dataset.

Parameters:

Name Type Description Default
class_datasets List[type]

List of types of WildlifeDataset to download.

required
root_dataset str

Path where all datasets are stored.

required
root_dataframe str

Path where all dataframes are stored.

required

Returns:

Type Description
List[WildlifeDataset]

The list of loaded datasets.

Source code in wildlife_datasets/loader/loader.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def load_datasets(
        class_datasets: List[type],
        root_dataset: str,
        root_dataframe: str,
        **kwargs
        ) -> List[WildlifeDataset]:
    """Loads multiple datasets as described in `load_dataset`.

    Args:
        class_datasets (List[type]): List of types of WildlifeDataset to download.
        root_dataset (str): Path where all datasets are stored.
        root_dataframe (str): Path where all dataframes are stored.

    Returns:
        The list of loaded datasets.
    """

    datasets = []
    for class_dataset in class_datasets:
        datasets.append(load_dataset(class_dataset, root_dataset, root_dataframe, **kwargs))
    return datasets