
Reference utils

This page describes the functions used for dataset analysis and loading.

Analysis

compute_span(df)

Compute the time span of the dataset.

The span is defined as the difference between the latest and the earliest image timestamps. Spans are computed separately for each individual and the maximum over individuals is returned.

Parameters:

Name  Type       Description                    Default
df    DataFrame  A full dataframe of the data.  required

Returns:

Type   Description
float  The span of the dataset in seconds.

Source code in wildlife_datasets/analysis/statistics.py
def compute_span(df: pd.DataFrame) -> float:
    """Compute the time span of the dataset.

    The span is defined as the difference between the latest and the earliest
    image timestamps. Spans are computed separately for each individual
    and the maximum over individuals is returned.

    Args:
        df (pd.DataFrame): A full dataframe of the data.

    Returns:
        The span of the dataset in seconds.
    """

    # Remove missing dates and convert the remainder to datetime
    df = df.loc[~df['date'].isnull()]
    dates = pd.to_datetime(df['date']).to_numpy()

    # Find the maximal span across individuals
    identities = df['identity'].unique()
    span_seconds = -np.inf
    for identity in identities:
        idx = df['identity'] == identity
        span_seconds = np.maximum(span_seconds, (max(dates[idx]) - min(dates[idx])) / np.timedelta64(1, 's'))
    return span_seconds
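
A minimal usage sketch: the import path is inferred from the source location above, and the toy dataframe is purely illustrative; it only needs the 'identity' and 'date' columns that compute_span reads.

import pandas as pd

from wildlife_datasets.analysis.statistics import compute_span  # path inferred from above

# Toy dataframe: individual 'b' spans nine days, individual 'a' only two
df = pd.DataFrame({
    'identity': ['a', 'a', 'b', 'b'],
    'date': ['2020-01-01', '2020-01-03', '2020-01-01', '2020-01-10'],
})
print(compute_span(df) / (60 * 60 * 24))  # 9.0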

display_statistics(df, unknown_name='')

Prints statistics about the dataframe.

Parameters:

Name          Type       Description                    Default
df            DataFrame  A full dataframe of the data.  required
unknown_name  str        Name of the unknown class.     ''
Source code in wildlife_datasets/analysis/statistics.py
def display_statistics(df: pd.DataFrame, unknown_name: str = '') -> None:
    """Prints statistics about the dataframe.

    Args:
        df (pd.DataFrame): A full dataframe of the data.
        unknown_name (str, optional): Name of the unknown class.
    """

    # Remove the unknown identities
    df_red = df.loc[df['identity'] != unknown_name, 'identity']
    df_red.value_counts().reset_index(drop=True).plot(xlabel='identities', ylabel='counts')

    # Compute the total number of identities
    if unknown_name in list(df['identity'].unique()):
        n_identity = len(df['identity'].unique()) - 1
    else:
        n_identity = len(df['identity'].unique())
    n_one = len(df.groupby('identity').filter(lambda x: len(x) == 1))
    n_unidentified = sum(df['identity'] == unknown_name)

    # Print general statistics
    print(f"Number of identitites            {n_identity}")
    print(f"Number of all animals            {len(df)}")
    print(f"Number of animals with one image {n_one}")
    print(f"Number of unidentified animals   {n_unidentified}")

    # Print statistics about video if present
    if 'video' in df.columns:
        print(f"Number of videos                 {len(df[['identity', 'video']].drop_duplicates())}")

    # Print statistics about time span if present
    if 'date' in df.columns:
        span_years = compute_span(df) / (60*60*24*365.25)
        if span_years > 1:
            print("Images span                      %1.1f years" % span_years)
        elif span_years * 12 > 1:
            print("Images span                      %1.1f months" % (span_years * 12))
        else:
            print("Images span                      %1.0f days" % (span_years * 365.25))

Loading

get_dataframe_path(root_dataframe, class_dataset)

Creates the path to the pickled dataframe.

Parameters:

Name            Type  Description                            Default
root_dataframe  str   Path where all dataframes are stored.  required
class_dataset   type  Type of DatasetFactory.                required

Returns:

Type  Description
str   Path to the dataframe.

Source code in wildlife_datasets/loader/loader.py
def get_dataframe_path(root_dataframe: str, class_dataset: type) -> str:
    """Creates path to the pickled dataframe.

    Args:
        root_dataframe (str): Path where all dataframes are stored.
        class_dataset (type): Type of DatasetFactory.

    Returns:
        Path to the dataframe.
    """

    return os.path.join(root_dataframe, class_dataset.__name__ + '.pkl')
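
For instance, assuming the function is re-exported by the loader module and using MacaqueFaces as an illustrative DatasetFactory subclass:

from wildlife_datasets import datasets, loader  # assumed top-level modules

path = loader.get_dataframe_path('dataframes', datasets.MacaqueFaces)
print(path)  # dataframes/MacaqueFaces.pkl on POSIX systems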

get_dataset_folder(root_dataset, class_dataset)

Creates the path to the dataset data.

Parameters:

Name           Type  Description                          Default
root_dataset   str   Path where all datasets are stored.  required
class_dataset  type  Type of DatasetFactory.              required

Returns:

Type  Description
str   Path to the stored data.

Source code in wildlife_datasets/loader/loader.py
def get_dataset_folder(root_dataset: str, class_dataset: type) -> str:
    """Creates path to the dataset data.

    Args:
        root_dataset (str): Path where all datasets are stored.
        class_dataset (type): Type of DatasetFactory.

    Returns:
        Path to the stored data.
    """

    return os.path.join(root_dataset, class_dataset.display_name())
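
A matching sketch under the same assumptions; the folder name comes from display_name(), which typically, but not necessarily, matches the class name:

from wildlife_datasets import datasets, loader  # assumed top-level modules

folder = loader.get_dataset_folder('data', datasets.MacaqueFaces)
print(folder)  # typically data/MacaqueFaces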

load_dataset(class_dataset, root_dataset, root_dataframe, overwrite=False, **kwargs)

Loads a dataset from a pickled dataframe or creates it.

If the dataframe is already saved in a pkl file, it loads it. Otherwise, it creates the dataframe and saves it in a pkl file.

Parameters:

Name            Type  Description                                            Default
class_dataset   type  Type of DatasetFactory to load.                        required
root_dataset    str   Path where all datasets are stored.                    required
root_dataframe  str   Path where all dataframes are stored.                  required
overwrite       bool  Whether the pickled dataframe should be overwritten.   False

Returns:

Type            Description
DatasetFactory  The loaded dataset.

Source code in wildlife_datasets/loader/loader.py
def load_dataset(
        class_dataset: type,
        root_dataset: str,
        root_dataframe: str,
        overwrite: bool = False,
        **kwargs
        ) -> DatasetFactory:
    """Loads dataset from a pickled dataframe or creates it.

    If the dataframe is already saved in a pkl file, it loads it.
    Otherwise, it creates the dataframe and saves it in a pkl file.

    Args:
        class_dataset (type): Type of DatasetFactory to load.
        root_dataset (str): Path where all datasets are stored.
        root_dataframe (str): Path where all dataframes are stored.
        overwrite (bool, optional): Whether the pickled dataframe should be overwritten.

    Returns:
        The loaded dataset.
    """

    # Check if the dataset is downloaded
    if not os.path.exists(root_dataset):
        raise Exception('Data not found. Download it first.')

    # Get paths of the dataset and the pickled dataframe
    root = get_dataset_folder(root_dataset, class_dataset)
    df_path = get_dataframe_path(root_dataframe, class_dataset)
    if not class_dataset.determined_by_df:
        # Create the dataframe, no point in saving as it is not determined by it
        dataset = class_dataset(root, None, **kwargs)
    elif overwrite or not os.path.exists(df_path):
        # Create the dataframe, save it and create the dataset
        dataset = class_dataset(root, None, **kwargs)
        if not os.path.exists(root_dataframe):
            os.makedirs(root_dataframe)
        dataset.df.to_pickle(df_path)
    else:
        # Load the dataframe and create the dataset
        df = pd.read_pickle(df_path)
        dataset = class_dataset(root, df, **kwargs)
    return dataset
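
A usage sketch, assuming the data have already been downloaded to 'data' and that MacaqueFaces is an available DatasetFactory subclass:

from wildlife_datasets import datasets, loader  # assumed top-level modules

# The first call builds the dataframe and pickles it to dataframes/MacaqueFaces.pkl;
# later calls load the pickle instead of re-parsing the dataset files
d = loader.load_dataset(datasets.MacaqueFaces, 'data', 'dataframes')

# Force the dataframe to be rebuilt and re-pickled
d = loader.load_dataset(datasets.MacaqueFaces, 'data', 'dataframes', overwrite=True)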

load_datasets(class_datasets, root_dataset, root_dataframe, **kwargs)

Loads multiple datasets as described in load_dataset.

Parameters:

Name            Type        Description                                Default
class_datasets  List[type]  List of types of DatasetFactory to load.  required
root_dataset    str         Path where all datasets are stored.       required
root_dataframe  str         Path where all dataframes are stored.     required

Returns:

Type                  Description
List[DatasetFactory]  The list of loaded datasets.

Source code in wildlife_datasets/loader/loader.py
def load_datasets(
        class_datasets: List[type],
        root_dataset: str,
        root_dataframe: str,
        **kwargs
        ) -> List[DatasetFactory]:
    """Loads multiple datasets as described in `load_dataset`.

    Args:
        class_datasets (List[type]): List of types of DatasetFactory to load.
        root_dataset (str): Path where all datasets are stored.
        root_dataframe (str): Path where all dataframes are stored.

    Returns:
        The list of loaded datasets.
    """

    return [load_dataset(class_dataset, root_dataset, root_dataframe, **kwargs) for class_dataset in class_datasets]
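
The same pattern extends to several datasets at once; the dataset classes below are an illustrative choice:

from wildlife_datasets import datasets, loader  # assumed top-level modules

ds = loader.load_datasets(
    [datasets.MacaqueFaces, datasets.IPanda50],
    'data',
    'dataframes',
)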