
Reference splitting functions

This file describes methods associated with dataset splitting.

Balanced split

Base class for splitting datasets into training and testing sets.

Implements methods from this paper (arXiv:2211.10307). Its subclasses need to implement the split method, which should perform balanced splits separately for each class. Its children are IdentitySplit and TimeAwareSplit. IdentitySplit has children ClosedSetSplit, OpenSetSplit and DisjointSetSplit. TimeAwareSplit has children TimeProportionSplit and TimeCutoffSplit.

Source code in wildlife_datasets/splits/balanced_split.py
class BalancedSplit():
    """Base class for splitting datasets into training and testing sets.

    Implements methods from [this paper](https://arxiv.org/abs/2211.10307).
    Its subclasses need to implement the `split` method.
    It should perform balanced splits separately for all classes.
    Its children are `IdentitySplit` and `TimeAwareSplit`.
    `IdentitySplit` has children `ClosedSetSplit`, `OpenSetSplit` and `DisjointSetSplit`.
    `TimeAwareSplit` has children `TimeProportionSplit` and `TimeCutoffSplit`.
    """

    def initialize_lcg(self) -> Lcg:
        """Returns the random number generator.

        Returns:
            The random number generator.
        """

        return Lcg(self.seed)

    def split(self, *args, **kwargs) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Splitting method which needs to be implemented by subclasses.

        It splits the dataframe `df` into labels `idx_train` and `idx_test`.
        The subdataset is obtained by `df.loc[idx_train]` (not `iloc`).

        Returns:
            List of splits. Each split is a list of labels of the training and testing sets.
        """

        raise(NotImplementedError('Subclasses should implement this. \n You may want to use ClosedSetSplit instead of BalancedSplit.'))

    def resplit_by_features(
            self,
            df: pd.DataFrame,
            features: np.ndarray,
            idx_train: np.ndarray,
            n_max_cluster: int = 5,
            eps_min: float = 0.01,
            eps_max: float = 0.50,
            eps_step: float = 0.01,
            min_samples: int = 2,
            save_clusters_prefix: Optional[str] = None,
            ) -> Tuple[np.ndarray, np.ndarray]:

        """Creates a random re-split of an already existing split.

        The re-split is based on similarity of features.
        It runs DBSCAN with increasing eps (cluster radius)
        and keeps the last clustering in which no cluster
        is bigger than `n_max_cluster`.
        Whole clusters of similar images are then put into the training set.
        The rest is randomly split into the training and testing sets.
        The re-split mimics the original split: for each individual,
        the training set contains the same number of samples as before,
        and similarly for the testing set.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.
            features (np.ndarray): An array of features with the same length as `df`.
            idx_train (np.ndarray): Labels of the training set.
            n_max_cluster (int, optional): Maximal size of cluster before `eps` stops increasing.
            eps_min (float, optional): Lower bound for epsilon.
            eps_max (float, optional): Upper bound for epsilon.
            eps_step (float, optional): Step for epsilon.
            min_samples (int, optional): Minimal cluster size.
            save_clusters_prefix (Optional[str], optional): File name prefix for saving clusters.

        Returns:
            List of labels of the training and testing sets.
        """

        # Modify the dataframe if the function is present
        if hasattr(self, 'modify_df'):
            df = self.modify_df(df)

        # Initialize the random number generator
        lcg = self.initialize_lcg()

        # Determine how many images of each individual should be in the training set
        identity_train_counts = df.loc[idx_train]['identity'].value_counts()

        # Loop over individuals and create a split for each
        idx_train_new = []
        for identity, df_identity in tqdm(df.groupby('identity')):
            n_train = identity_train_counts.get(identity, 0)
            if len(df_identity) - n_train <= 1:
                # All samples, or all but one, go into the training set
                idx_remaining = np.array(df_identity.index)
                idx_remaining = lcg.random_shuffle(idx_remaining)
                idx_train_identity = idx_remaining[:n_train]
            else:
                f = features[df.index.get_indexer(df_identity.index)]
                # Run DBSCAN with increasing eps and keep the last clustering with no cluster bigger than n_max_cluster
                clusters_saved = None
                for eps in np.arange(eps_min, eps_max+eps_step, eps_step):
                    clustering = DBSCAN(eps=eps, min_samples=min_samples)
                    clustering.fit(f)
                    clusters = pd.Series(clustering.labels_)
                    clusters_counts = clusters.value_counts(sort=True)
                    # Check that the largest cluster (excluding outliers) is not too big
                    if clusters_counts.index[0] == -1:
                        clustering_failed = len(clusters_counts) > 1 and clusters_counts.iloc[1] > n_max_cluster
                    else:
                        clustering_failed = len(clusters_counts) == 1 or clusters_counts.iloc[0] > n_max_cluster
                    # If the largest cluster is not too big, save the clustering and continue
                    if not clustering_failed:
                        clusters_saved = clusters
                    else:
                        break

                # Save the clusters
                if save_clusters_prefix is not None:
                    df_save = pd.DataFrame({'cluster': clusters_saved.to_numpy()}, index=df_identity.index)
                    df_save.to_csv(f'{save_clusters_prefix}_{identity}.csv')

                # Add all the clusters into the training set
                idx_train_identity = []
                if clusters_saved is not None:
                    for cluster, df_cluster in pd.DataFrame({'cluster': clusters_saved}).groupby('cluster'):
                        # Skip outliers and clusters that would overflow the training set
                        if cluster != -1 and len(idx_train_identity) + len(df_cluster) <= n_train:
                            idx_train_identity += list(df_identity.index[df_cluster.index])

                # Distribute the remaining indices
                n_train_remaining = n_train - len(idx_train_identity)
                idx_remaining = np.array(list(set(df_identity.index) - set(idx_train_identity)))
                idx_remaining = lcg.random_shuffle(idx_remaining)
                idx_train_identity += list(idx_remaining[:n_train_remaining])
            idx_train_new += list(idx_train_identity)
        idx_test_new = list(set(df.index) - set(idx_train_new))
        return np.array(idx_train_new), np.array(idx_test_new)
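
Since subclasses only need to provide split, a minimal custom subclass can be sketched as follows. This is an illustration, not part of the library; it assumes BalancedSplit can be imported from wildlife_datasets.splits, and it simply alternates the samples of each identity between the two sets.

import numpy as np
import pandas as pd
from wildlife_datasets.splits import BalancedSplit  # assumed import path

class AlternatingSplit(BalancedSplit):
    """Illustrative subclass: even-positioned samples of each identity go to training."""

    def __init__(self, seed: int = 666):
        self.seed = seed

    def split(self, df: pd.DataFrame):
        idx_train, idx_test = [], []
        for _, df_identity in df.groupby('identity'):
            idx_train += list(df_identity.index[0::2])
            idx_test += list(df_identity.index[1::2])
        return [(np.array(idx_train), np.array(idx_test))]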

initialize_lcg()

Returns the random number generator.

Returns:

Lcg: The random number generator.

Source code in wildlife_datasets/splits/balanced_split.py
def initialize_lcg(self) -> Lcg:
    """Returns the random number generator.

    Returns:
        The random number generator.
    """

    return Lcg(self.seed)
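
The generator makes the splits reproducible: the same seed yields the same permutations. A minimal sketch, assuming Lcg is importable from the package internals (the import path below is hypothetical; only the calls random_permutation and random_shuffle, which appear in the source on this page, are used):

import numpy as np
from wildlife_datasets.splits.lcg import Lcg  # hypothetical import path

lcg = Lcg(666)
perm = lcg.random_permutation(5)                       # deterministic permutation of 0..4
shuffled = lcg.random_shuffle(np.array([10, 20, 30]))  # deterministic shuffle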

resplit_by_features(df, features, idx_train, n_max_cluster=5, eps_min=0.01, eps_max=0.5, eps_step=0.01, min_samples=2, save_clusters_prefix=None)

Creates a random re-split of an already existing split.

The re-split is based on similarity of features. It runs DBSCAN with increasing eps (cluster radius) and keeps the last clustering in which no cluster is bigger than n_max_cluster. Whole clusters of similar images are then put into the training set. The rest is randomly split into the training and testing sets. The re-split mimics the original split: for each individual, the training set contains the same number of samples as before, and similarly for the testing set.

Parameters:

df (DataFrame, required): A dataframe of the data. It must contain column identity.
features (ndarray, required): An array of features with the same length as df.
idx_train (ndarray, required): Labels of the training set.
n_max_cluster (int, default 5): Maximal size of cluster before eps stops increasing.
eps_min (float, default 0.01): Lower bound for epsilon.
eps_max (float, default 0.5): Upper bound for epsilon.
eps_step (float, default 0.01): Step for epsilon.
min_samples (int, default 2): Minimal cluster size.
save_clusters_prefix (Optional[str], default None): File name prefix for saving clusters.

Returns:

Tuple[ndarray, ndarray]: List of labels of the training and testing sets.

Source code in wildlife_datasets/splits/balanced_split.py
def resplit_by_features(
        self,
        df: pd.DataFrame,
        features: np.ndarray,
        idx_train: np.ndarray,
        n_max_cluster: int = 5,
        eps_min: float = 0.01,
        eps_max: float = 0.50,
        eps_step: float = 0.01,
        min_samples: int = 2,
        save_clusters_prefix: Optional[str] = None,
        ) -> Tuple[np.ndarray, np.ndarray]:

    """Creates a random re-split of an already existing split.

    The re-split is based on similarity of features.
    It runs DBSCAN with increasing eps (cluster radius)
    and keeps the last clustering in which no cluster
    is bigger than `n_max_cluster`.
    Whole clusters of similar images are then put into the training set.
    The rest is randomly split into the training and testing sets.
    The re-split mimics the original split: for each individual,
    the training set contains the same number of samples as before,
    and similarly for the testing set.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.
        features (np.ndarray): An array of features with the same length as `df`.
        idx_train (np.ndarray): Labels of the training set.
        n_max_cluster (int, optional): Maximal size of cluster before `eps` stops increasing.
        eps_min (float, optional): Lower bound for epsilon.
        eps_max (float, optional): Upper bound for epsilon.
        eps_step (float, optional): Step for epsilon.
        min_samples (int, optional): Minimal cluster size.
        save_clusters_prefix (Optional[str], optional): File name prefix for saving clusters.

    Returns:
        List of labels of the training and testing sets.
    """

    # Modify the dataframe if the function is present
    if hasattr(self, 'modify_df'):
        df = self.modify_df(df)

    # Initialize the random number generator
    lcg = self.initialize_lcg()

    # Determine how many images of each individual should be in the training set
    identity_train_counts = df.loc[idx_train]['identity'].value_counts()

    # Loop over individuals and create a split for each
    idx_train_new = []
    for identity, df_identity in tqdm(df.groupby('identity')):
        n_train = identity_train_counts.get(identity, 0)
        if len(df_identity) - n_train <= 1:
            # All samples, or all but one, go into the training set
            idx_remaining = np.array(df_identity.index)
            idx_remaining = lcg.random_shuffle(idx_remaining)
            idx_train_identity = idx_remaining[:n_train]
        else:
            f = features[df.index.get_indexer(df_identity.index)]
            # Run DBSCAN with increasing eps and keep the last clustering with no cluster bigger than n_max_cluster
            clusters_saved = None
            for eps in np.arange(eps_min, eps_max+eps_step, eps_step):
                clustering = DBSCAN(eps=eps, min_samples=min_samples)
                clustering.fit(f)
                clusters = pd.Series(clustering.labels_)
                clusters_counts = clusters.value_counts(sort=True)
                # Check that the largest cluster (excluding outliers) is not too big
                if clusters_counts.index[0] == -1:
                    clustering_failed = len(clusters_counts) > 1 and clusters_counts.iloc[1] > n_max_cluster
                else:
                    clustering_failed = len(clusters_counts) == 1 or clusters_counts.iloc[0] > n_max_cluster
                # If the largest cluster is not too big, save the clustering and continue
                if not clustering_failed:
                    clusters_saved = clusters
                else:
                    break

            # Save the clusters
            if save_clusters_prefix is not None:
                df_save = pd.DataFrame({'cluster': clusters_saved.to_numpy()}, index=df_identity.index)
                df_save.to_csv(f'{save_clusters_prefix}_{identity}.csv')

            # Add all the clusters into the training set
            idx_train_identity = []
            if clusters_saved is not None:
                for cluster, df_cluster in pd.DataFrame({'cluster': clusters_saved}).groupby('cluster'):
                    # Skip outliers and clusters that would overflow the training set
                    if cluster != -1 and len(idx_train_identity) + len(df_cluster) <= n_train:
                        idx_train_identity += list(df_identity.index[df_cluster.index])

            # Distribute the remaining indices
            n_train_remaining = n_train - len(idx_train_identity)
            idx_remaining = np.array(list(set(df_identity.index) - set(idx_train_identity)))
            idx_remaining = lcg.random_shuffle(idx_remaining)
            idx_train_identity += list(idx_remaining[:n_train_remaining])
        idx_train_new += list(idx_train_identity)
    idx_test_new = list(set(df.index) - set(idx_train_new))
    return np.array(idx_train_new), np.array(idx_test_new)
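
A usage sketch, assuming df is a metadata dataframe with an identity column and features is a hypothetical array of image embeddings aligned row by row with df:

from wildlife_datasets import splits  # assuming the package's usual import

splitter = splits.ClosedSetSplit(ratio_train=0.8)
idx_train, idx_test = splitter.split(df)[0]
# Re-split so that clusters of visually similar images land in the training set
idx_train_new, idx_test_new = splitter.resplit_by_features(df, features, idx_train)
df_train, df_test = df.loc[idx_train_new], df.loc[idx_test_new]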

split(*args, **kwargs)

Splitting method which needs to be implemented by subclasses.

It splits the dataframe df into labels idx_train and idx_test. The subdataset is obtained by df.loc[idx_train] (not iloc).

Returns:

List[Tuple[ndarray, ndarray]]: List of splits. Each split is a list of labels of the training and testing sets.

Source code in wildlife_datasets/splits/balanced_split.py
def split(self, *args, **kwargs) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Splitting method which needs to be implemented by subclasses.

    It splits the dataframe `df` into labels `idx_train` and `idx_test`.
    The subdataset is obtained by `df.loc[idx_train]` (not `iloc`).

    Returns:
        List of splits. Each split is a list of labels of the training and testing sets.
    """

    raise(NotImplementedError('Subclasses should implement this. \n You may want to use ClosedSetSplit instead of BalancedSplit.'))

Identity split

Bases: BalancedSplit

Base class for ClosedSetSplit, OpenSetSplit and DisjointSetSplit.

Source code in wildlife_datasets/splits/identity_split.py
class IdentitySplit(BalancedSplit):
    """Base class for `ClosedSetSplit`, `OpenSetSplit` and `DisjointSetSplit`.
    """

    def modify_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepares dataframe for splits.

        Removes identities specified in `self.identity_skip` (usually unknown identities).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

        Returns:
            Modified dataframe of the data.
        """

        df = df.copy()
        df = df[df['identity'] != self.identity_skip]
        return df

    def general_split(
            self,
            df: pd.DataFrame,
            individual_train: List[str],
            individual_test: List[str],
            ) -> Tuple[np.ndarray, np.ndarray]:
        """General-purpose split into the training and testing sets.

        It puts all samples of `individual_train` into the training set
        and all samples of `individual_test` into the testing set.
        The splitting is performed for each individual separately.
        The split will result in at least one sample in both the training and testing sets.
        If only one sample is available for an individual, it will be in the training set.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.
            individual_train (List[str]): Individuals to be only in the training set.
            individual_test (List[str]): Individuals to be only in the testing set.

        Returns:
            List of labels of the training and testing sets.
        """

        # Initialize the random number generator
        lcg = self.initialize_lcg()

        # Compute how many samples go automatically to the training and testing sets
        y_counts = df['identity'].value_counts()
        n_train = sum([y_counts.loc[y] for y in individual_train])
        n_test = sum([y_counts.loc[y] for y in individual_test])

        # Recompute ratio_train and adjust it to proper bounds
        n = len(df)
        ratio_train = self.ratio_train
        if n_train + n_test > 0 and n_train + n_test < n:
            ratio_train = (n*ratio_train - n_train) / (n - n_test - n_train)
        ratio_train = np.clip(ratio_train, 0, 1)

        idx_train = []
        idx_test = []        
        # Make a loop over all individuals
        for individual, df_individual in df.groupby('identity'):
            if individual in individual_train and individual in individual_test:
                # An individual cannot be in both sets
                raise(Exception('Individual cannot be both in individual_train and individual_test.'))
            elif individual in individual_train:
                # Put all samples of this individual into the training set
                idx_train += list(df_individual.index)
            elif individual in individual_test:
                # Put all samples of this individual into the testing set
                idx_test += list(df_individual.index)
            else:
                # Otherwise compute the number of samples in the training set
                n_individual = len(df_individual)
                n_train = np.round(ratio_train * n_individual).astype(int)
                if n_train == n_individual and n_train > 1:
                    n_train -= 1
                if n_train == 0:
                    n_train = 1
                # Randomly permute the indices and assign the first n_train of them to the training set
                idx_permutation = lcg.random_permutation(n_individual)
                idx_permutation = np.array(idx_permutation)
                idx_train += list(df_individual.index[idx_permutation[:n_train]])
                idx_test += list(df_individual.index[idx_permutation[n_train:]])
        return np.array(idx_train), np.array(idx_test)

general_split(df, individual_train, individual_test)

General-purpose split into the training and testing sets.

It puts all samples of individual_train into the training set and all samples of individual_test into the testing set. The splitting is performed for each individual separately. The split will result in at least one sample in both the training and testing sets. If only one sample is available for an individual, it will be in the training set.

Parameters:

df (DataFrame, required): A dataframe of the data. It must contain column identity.
individual_train (List[str], required): Individuals to be only in the training set.
individual_test (List[str], required): Individuals to be only in the testing set.

Returns:

Tuple[ndarray, ndarray]: List of labels of the training and testing sets.

Source code in wildlife_datasets/splits/identity_split.py
def general_split(
        self,
        df: pd.DataFrame,
        individual_train: List[str],
        individual_test: List[str],
        ) -> Tuple[np.ndarray, np.ndarray]:
    """General-purpose split into the training and testing sets.

    It puts all samples of `individual_train` into the training set
    and all samples of `individual_test` into the testing set.
    The splitting is performed for each individual separately.
    The split will result in at least one sample in both the training and testing sets.
    If only one sample is available for an individual, it will be in the training set.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.
        individual_train (List[str]): Individuals to be only in the training set.
        individual_test (List[str]): Individuals to be only in the testing set.

    Returns:
        List of labels of the training and testing sets.
    """

    # Initialize the random number generator
    lcg = self.initialize_lcg()

    # Compute how many samples go automatically to the training and testing sets
    y_counts = df['identity'].value_counts()
    n_train = sum([y_counts.loc[y] for y in individual_train])
    n_test = sum([y_counts.loc[y] for y in individual_test])

    # Recompute ratio_train and adjust it to proper bounds
    n = len(df)
    ratio_train = self.ratio_train
    if n_train + n_test > 0 and n_train + n_test < n:
        ratio_train = (n*ratio_train - n_train) / (n - n_test - n_train)
    ratio_train = np.clip(ratio_train, 0, 1)

    idx_train = []
    idx_test = []        
    # Make a loop over all individuals
    for individual, df_individual in df.groupby('identity'):
        if individual in individual_train and individual in individual_test:
            # An individual cannot be in both sets
            raise(Exception('Individual cannot be both in individual_train and individual_test.'))
        elif individual in individual_train:
            # Put all samples of this individual into the training set
            idx_train += list(df_individual.index)
        elif individual in individual_test:
            # Put all samples of this individual into the testing set
            idx_test += list(df_individual.index)
        else:
            # Otherwise compute the number of samples in the training set
            n_individual = len(df_individual)
            n_train = np.round(ratio_train * n_individual).astype(int)
            if n_train == n_individual and n_train > 1:
                n_train -= 1
            if n_train == 0:
                n_train = 1
            # Randomly permute the indices and assign the first n_train of them to the training set
            idx_permutation = lcg.random_permutation(n_individual)
            idx_permutation = np.array(idx_permutation)
            idx_train += list(df_individual.index[idx_permutation[:n_train]])
            idx_test += list(df_individual.index[idx_permutation[n_train:]])
    return np.array(idx_train), np.array(idx_test)
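
As a worked example of the ratio adjustment above: with n = 100 samples, ratio_train = 0.5, and forced individuals accounting for n_train = 30 training and n_test = 10 testing samples, the remaining 60 samples are split with the adjusted ratio:

n, ratio_train, n_train, n_test = 100, 0.5, 30, 10
ratio_adjusted = (n*ratio_train - n_train) / (n - n_test - n_train)
# (50 - 30) / 60 = 1/3, so about 20 of the remaining 60 samples go to training,
# giving 30 + 20 = 50 training samples overall and matching ratio_train = 0.5.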

modify_df(df)

Prepares dataframe for splits.

Removes identities specified in self.identity_skip (usually unknown identities).

Parameters:

df (DataFrame, required): A dataframe of the data. It must contain column identity.

Returns:

DataFrame: Modified dataframe of the data.

Source code in wildlife_datasets/splits/identity_split.py
def modify_df(self, df: pd.DataFrame) -> pd.DataFrame:
    """Prepares dataframe for splits.

    Removes identities specified in `self.identity_skip` (usually unknown identities).

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

    Returns:
        Modified dataframe of the data.
    """

    df = df.copy()
    df = df[df['identity'] != self.identity_skip]
    return df

Closed-set split

Bases: IdentitySplit

Closed-set splitting method into training and testing sets.

All individuals are in both the training and testing sets. The only exception is individuals with a single sample, which go only into the training set. Implementation of this paper (arXiv:2211.10307).

Source code in wildlife_datasets/splits/identity_split.py
class ClosedSetSplit(IdentitySplit):
    """Closed-set splitting method into training and testing sets.

    All individuals are in both the training and testing sets.
    The only exception is individuals with a single sample, which go only into the training set.
    Implementation of [this paper](https://arxiv.org/abs/2211.10307).
    """

    def __init__(
            self,
            ratio_train: float,
            seed: int = 666,
            identity_skip: str = 'unknown',
            ) -> None:
        """Initializes the class.

        Args:
            ratio_train (float): *Approximate* size of the training set.
            seed (int, optional): Initial seed for the LCG random generator.
            identity_skip (str, optional): Name of the identities to ignore.
        """

        self.ratio_train = ratio_train
        self.identity_skip = identity_skip
        self.seed = seed

    def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

        Returns:
            List of splits. Each split is a list of labels of the training and testing sets.
        """

        df = self.modify_df(df)
        individual_train = np.array([], dtype=object)
        individual_test = np.array([], dtype=object)
        return [self.general_split(df, individual_train, individual_test)]

__init__(ratio_train, seed=666, identity_skip='unknown')

Initializes the class.

Parameters:

ratio_train (float, required): Approximate size of the training set.
seed (int, default 666): Initial seed for the LCG random generator.
identity_skip (str, default 'unknown'): Name of the identities to ignore.
Source code in wildlife_datasets/splits/identity_split.py
def __init__(
        self,
        ratio_train: float,
        seed: int = 666,
        identity_skip: str = 'unknown',
        ) -> None:
    """Initializes the class.

    Args:
        ratio_train (float): *Approximate* size of the training set.
        seed (int, optional): Initial seed for the LCG random generator.
        identity_skip (str, optional): Name of the identities to ignore.
    """

    self.ratio_train = ratio_train
    self.identity_skip = identity_skip
    self.seed = seed

split(df)

Implementation of the base splitting method.

Parameters:

df (DataFrame, required): A dataframe of the data. It must contain column identity.

Returns:

List[Tuple[ndarray, ndarray]]: List of splits. Each split is a list of labels of the training and testing sets.

Source code in wildlife_datasets/splits/identity_split.py
def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

    Returns:
        List of splits. Each split is a list of labels of the training and testing sets.
    """

    df = self.modify_df(df)
    individual_train = np.array([], dtype=object)
    individual_test = np.array([], dtype=object)
    return [self.general_split(df, individual_train, individual_test)]
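
A usage sketch, assuming df is a metadata dataframe with an identity column (for instance, as returned by one of the package's dataset loaders):

from wildlife_datasets import splits  # assuming the package's usual import

splitter = splits.ClosedSetSplit(ratio_train=0.8)
for idx_train, idx_test in splitter.split(df):
    df_train, df_test = df.loc[idx_train], df.loc[idx_test]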

Open-set split

Bases: IdentitySplit

Open-set splitting method into training and testing sets.

Some individuals are only in the testing set, not in the training set. Implementation of this paper (arXiv:2211.10307).

Source code in wildlife_datasets/splits/identity_split.py
class OpenSetSplit(IdentitySplit):
    """Open-set splitting method into training and testing sets.

    Some individuals are in the testing but not in the training set.
    Implementation of [this paper](https://arxiv.org/abs/2211.10307).
    """

    def __init__(
            self,
            ratio_train: float,
            ratio_class_test: float = None,
            n_class_test: int = None,
            seed: int = 666,
            identity_skip: str = 'unknown',
            ) -> None:
        """Initializes the class.

        The user must provide exactly one from `ratio_class_test` and `n_class_test`.
        The latter specifies the number of individuals to be only in the testing set.
        The former specifies the ratio of samples of individuals (not individuals themselves)
        to be only in the testing set.

        Args:
            ratio_train (float): *Approximate* size of the training set.
            ratio_class_test (float, optional): *Approximate* ratio of samples of individuals only in the testing set.
            n_class_test (int, optional): Number of individuals only in the testing set.
            seed (int, optional): Initial seed for the LCG random generator.
            identity_skip (str, optional): Name of the identities to ignore.
        """

        if ratio_class_test is None and n_class_test is None:
            raise(Exception('Either ratio_class_test or n_class_test must be provided.'))
        elif ratio_class_test is not None and n_class_test is not None:
            raise(Exception('Only ratio_class_test or n_class_test can be provided.'))

        self.ratio_train = ratio_train
        self.ratio_class_test = ratio_class_test
        self.n_class_test = n_class_test
        self.identity_skip = identity_skip
        self.seed = seed

    def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

        Returns:
            List of splits. Each split is a list of labels of the training and testing sets.
        """

        df = self.modify_df(df)

        # Initialize the random number generator
        lcg = self.initialize_lcg()

        # Compute the counts and randomly permute them
        y_counts = df['identity'].value_counts()
        n_class = len(y_counts)
        idx = lcg.random_permutation(n_class)
        y_counts = y_counts.iloc[idx]

        # Compute number of identities in the testing set
        n = len(df)
        if self.n_class_test is None:
            n_test = np.round(n*self.ratio_class_test).astype(int)
            n_class_test = np.where(np.cumsum(y_counts) >= n_test)[0][0] + 1
        else:
            n_class_test = self.n_class_test

        # Specify individuals going purely into training and testing sets
        individual_train = np.array([], dtype=object)
        individual_test = np.array(y_counts.index[:n_class_test])
        return [self.general_split(df, individual_train, individual_test)]

__init__(ratio_train, ratio_class_test=None, n_class_test=None, seed=666, identity_skip='unknown')

Initializes the class.

The user must provide exactly one of ratio_class_test and n_class_test. The latter specifies the number of individuals to be only in the testing set. The former specifies the ratio of samples of individuals (not individuals themselves) to be only in the testing set.

Parameters:

ratio_train (float, required): Approximate size of the training set.
ratio_class_test (float, default None): Approximate ratio of samples of individuals only in the testing set.
n_class_test (int, default None): Number of individuals only in the testing set.
seed (int, default 666): Initial seed for the LCG random generator.
identity_skip (str, default 'unknown'): Name of the identities to ignore.
Source code in wildlife_datasets/splits/identity_split.py
def __init__(
        self,
        ratio_train: float,
        ratio_class_test: float = None,
        n_class_test: int = None,
        seed: int = 666,
        identity_skip: str = 'unknown',
        ) -> None:
    """Initializes the class.

    The user must provide exactly one from `ratio_class_test` and `n_class_test`.
    The latter specifies the number of individuals to be only in the testing set.
    The former specifies the ratio of samples of individuals (not individuals themselves)
    to be only in the testing set.

    Args:
        ratio_train (float): *Approximate* size of the training set.
        ratio_class_test (float, optional): *Approximate* ratio of samples of individuals only in the testing set.
        n_class_test (int, optional): Number of individuals only in the testing set.
        seed (int, optional): Initial seed for the LCG random generator.
        identity_skip (str, optional): Name of the identities to ignore.
    """

    if ratio_class_test is None and n_class_test is None:
        raise(Exception('Either ratio_class_test or n_class_test must be provided.'))
    elif ratio_class_test is not None and n_class_test is not None:
        raise(Exception('Only ratio_class_test or n_class_test can be provided.'))

    self.ratio_train = ratio_train
    self.ratio_class_test = ratio_class_test
    self.n_class_test = n_class_test
    self.identity_skip = identity_skip
    self.seed = seed

split(df)

Implementation of the base splitting method.

Parameters:

df (DataFrame, required): A dataframe of the data. It must contain column identity.

Returns:

List[Tuple[ndarray, ndarray]]: List of splits. Each split is a list of labels of the training and testing sets.

Source code in wildlife_datasets/splits/identity_split.py
def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

    Returns:
        List of splits. Each split is a list of labels of the training and testing sets.
    """

    df = self.modify_df(df)

    # Initialize the random number generator
    lcg = self.initialize_lcg()

    # Compute the counts and randomly permute them
    y_counts = df['identity'].value_counts()
    n_class = len(y_counts)
    idx = lcg.random_permutation(n_class)
    y_counts = y_counts.iloc[idx]

    # Compute number of identities in the testing set
    n = len(df)
    if self.n_class_test is None:
        n_test = np.round(n*self.ratio_class_test).astype(int)
        n_class_test = np.where(np.cumsum(y_counts) >= n_test)[0][0] + 1
    else:
        n_class_test = self.n_class_test

    # Specify individuals going purely into training and testing sets
    individual_train = np.array([], dtype=object)
    individual_test = np.array(y_counts.index[:n_class_test])
    return [self.general_split(df, individual_train, individual_test)]
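
A usage sketch showing the two mutually exclusive ways to size the open part of the testing set; df is assumed to contain an identity column:

from wildlife_datasets import splits  # assuming the package's usual import

# Roughly 10% of samples belong to identities kept out of the training set
splitter = splits.OpenSetSplit(ratio_train=0.8, ratio_class_test=0.1)
# Alternative: exactly 5 identities kept out of the training set
# splitter = splits.OpenSetSplit(ratio_train=0.8, n_class_test=5)
idx_train, idx_test = splitter.split(df)[0]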

Disjoint-set split

Bases: IdentitySplit

Disjoint-set splitting method into training and testing sets.

No individual is in both the training and testing sets. Implementation of this paper (arXiv:2211.10307).

Source code in wildlife_datasets/splits/identity_split.py
class DisjointSetSplit(IdentitySplit):
    """Disjoint-set splitting method into training and testing sets.

    No individuals are in both the training and testing sets.
    Implementation of [this paper](https://arxiv.org/abs/2211.10307).
    """

    def __init__(
            self,
            ratio_class_test: float = None,
            n_class_test: int = None,
            seed: int = 666,
            identity_skip: str = 'unknown',
            ) -> None:
        """Initializes the class.

        The user must provide exactly one from `ratio_class_test` and `n_class_test`.
        The latter specifies the number of individuals to be only in the testing set.
        The former specifies the ratio of samples of individuals (not individuals themselves)
        to be only in the testing set.

        Args:
            ratio_class_test (float, optional): *Approximate* ratio of samples of individuals only in the testing set.
            n_class_test (int, optional): Number of individuals only in the testing set.
            seed (int, optional): Initial seed for the LCG random generator.
            identity_skip (str, optional): Name of the identities to ignore.
        """

        if ratio_class_test is None and n_class_test is None:
            raise(Exception('Either ratio_class_test or n_class_test must be provided.'))
        elif ratio_class_test is not None and n_class_test is not None:
            raise(Exception('Only ratio_class_test or n_class_test can be provided.'))

        self.ratio_train = 0 # Arbitrary value
        self.ratio_class_test = ratio_class_test
        self.n_class_test = n_class_test
        self.identity_skip = identity_skip
        self.seed = seed

    def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

        Returns:
            List of splits. Each split is a list of labels of the training and testing sets.
        """

        df = self.modify_df(df)

        # Initialize the random number generator
        lcg = self.initialize_lcg()

        # Compute the counts and randomly permute them
        y_counts = df['identity'].value_counts()
        n_class = len(y_counts)
        idx = lcg.random_permutation(n_class)
        y_counts = y_counts.iloc[idx]

        # Compute number of identities in the testing set
        n = len(df)
        if self.n_class_test is None:
            n_test = np.round(n*self.ratio_class_test).astype(int)
            n_class_test = np.where(np.cumsum(y_counts) >= n_test)[0][0] + 1
        else:
            n_class_test = self.n_class_test

        # Specify individuals going purely into training and testing sets
        individual_train = np.array(y_counts.index[n_class_test:])
        individual_test = np.array(y_counts.index[:n_class_test])
        return [self.general_split(df, individual_train, individual_test)]

__init__(ratio_class_test=None, n_class_test=None, seed=666, identity_skip='unknown')

Initializes the class.

The user must provide exactly one of ratio_class_test and n_class_test. The latter specifies the number of individuals to be only in the testing set. The former specifies the ratio of samples of individuals (not individuals themselves) to be only in the testing set.

Parameters:

ratio_class_test (float, default None): Approximate ratio of samples of individuals only in the testing set.
n_class_test (int, default None): Number of individuals only in the testing set.
seed (int, default 666): Initial seed for the LCG random generator.
identity_skip (str, default 'unknown'): Name of the identities to ignore.
Source code in wildlife_datasets/splits/identity_split.py
def __init__(
        self,
        ratio_class_test: float = None,
        n_class_test: int = None,
        seed: int = 666,
        identity_skip: str = 'unknown',
        ) -> None:
    """Initializes the class.

    The user must provide exactly one from `ratio_class_test` and `n_class_test`.
    The latter specifies the number of individuals to be only in the testing set.
    The former specifies the ratio of samples of individuals (not individuals themselves)
    to be only in the testing set.

    Args:
        ratio_class_test (float, optional): *Approximate* ratio of samples of individuals only in the testing set.
        n_class_test (int, optional): Number of individuals only in the testing set.
        seed (int, optional): Initial seed for the LCG random generator.
        identity_skip (str, optional): Name of the identities to ignore.
    """

    if ratio_class_test is None and n_class_test is None:
        raise(Exception('Either ratio_class_test or n_class_test must be provided.'))
    elif ratio_class_test is not None and n_class_test is not None:
        raise(Exception('Only ratio_class_test or n_class_test can be provided.'))

    self.ratio_train = 0 # Arbitrary value
    self.ratio_class_test = ratio_class_test
    self.n_class_test = n_class_test
    self.identity_skip = identity_skip
    self.seed = seed

split(df)

Implementation of the base splitting method.

Parameters:

df (DataFrame, required): A dataframe of the data. It must contain column identity.

Returns:

List[Tuple[ndarray, ndarray]]: List of splits. Each split is a list of labels of the training and testing sets.

Source code in wildlife_datasets/splits/identity_split.py
def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

    Returns:
        List of splits. Each split is a list of labels of the training and testing sets.
    """

    df = self.modify_df(df)

    # Initialize the random number generator
    lcg = self.initialize_lcg()

    # Compute the counts and randomly permute them
    y_counts = df['identity'].value_counts()
    n_class = len(y_counts)
    idx = lcg.random_permutation(n_class)
    y_counts = y_counts.iloc[idx]

    # Compute number of identities in the testing set
    n = len(df)
    if self.n_class_test is None:
        n_test = np.round(n*self.ratio_class_test).astype(int)
        n_class_test = np.where(np.cumsum(y_counts) >= n_test)[0][0] + 1
    else:
        n_class_test = self.n_class_test

    # Specify individuals going purely into training and testing sets
    individual_train = np.array(y_counts.index[n_class_test:])
    individual_test = np.array(y_counts.index[:n_class_test])
    return [self.general_split(df, individual_train, individual_test)]
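
A usage sketch; df is assumed to contain an identity column:

from wildlife_datasets import splits  # assuming the package's usual import

splitter = splits.DisjointSetSplit(n_class_test=10)
idx_train, idx_test = splitter.split(df)[0]
# No identity appears in both df.loc[idx_train] and df.loc[idx_test]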

Time-aware split

Bases: BalancedSplit

Base class for TimeProportionSplit and TimeCutoffSplit.

Source code in wildlife_datasets/splits/time_aware_split.py
class TimeAwareSplit(BalancedSplit):
    """Base class for `TimeProportionSplit` and `TimeCutoffSplit`.
    """

    def modify_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepares dataframe for splits.

        Removes identities specified in `self.identity_skip` (usually unknown identities).
        Converts the `date` column into a unified format.
        Adds the `year` column.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

        Returns:
            Modified dataframe of the data.
        """

        # Check if the DataFrame contains the column date.
        if 'date' not in df.columns:
            raise(Exception('Dataframe df does not contain column date.'))

        # Remove identities to be skipped
        df = df.copy()
        df = df[df['identity'] != self.identity_skip]

        # Removes entries without dates
        df = df[~df['date'].isnull()]

        # Convert date to datetime format (from possibly strings) and drop hours
        df['date'] = pd.to_datetime(df['date']).apply(lambda x: x.date())
        df['year'] = df['date'].apply(lambda x: x.year).to_numpy()            
        return df

    def resplit_random(
            self,
            df: pd.DataFrame,
            idx_train: np.ndarray,
            idx_test: np.ndarray,
            year_max: int = np.inf
            ) -> Tuple[np.ndarray, np.ndarray]:
        """Creates a random re-split of an already existing split.

        The re-split mimics the original split: for each individual,
        the training set contains the same number of samples as before,
        and similarly for the testing set.
        The re-split samples may be drawn only from `df['year'] <= year_max`.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.
            idx_train (np.ndarray): Labels of the training set.
            idx_test (np.ndarray): Labels of the testing set.
            year_max (int, optional): Considers only entries with `df['year'] <= year_max`.

        Returns:
            List of labels of the training and testing sets.
        """

        df = self.modify_df(df)

        # Initialize the random number generator
        lcg = self.initialize_lcg()

        # Compute the number of samples for each individual in the training set
        counts_train = {}
        for name, df_name in df.loc[idx_train].groupby('identity'):
            counts_train[name] = len(df_name)
        # Compute the number of samples for each individual in the testing set
        counts_test = {}
        for name, df_name in df.loc[idx_test].groupby('identity'):
            counts_test[name] = len(df_name)

        idx_train_new = []
        idx_test_new = []
        # Loop over all individuals
        for name, df_name in df.groupby('identity'):
            # Extract the number of samples of this individual in the training and testing sets
            n_train = counts_train.get(name, 0)
            n_test = counts_test.get(name, 0)
            if n_train+n_test > 0:
                # Get randomly permuted indices of the corresponding identity
                df_name = df_name[df_name['year'] <= year_max]
                if len(df_name) < n_train+n_test:
                    raise(Exception('The set is too small.'))
                # Get the correct number of indices in both sets
                idx_permutation = lcg.random_permutation(n_train+n_test)
                idx_permutation = np.array(idx_permutation)
                idx_train_new += list(df_name.index[idx_permutation[:n_train]])
                idx_test_new += list(df_name.index[idx_permutation[n_train:n_train+n_test]])
        return np.array(idx_train_new), np.array(idx_test_new)

modify_df(df)

Prepares dataframe for splits.

Removes identities specified in self.identity_skip (usually unknown identities). Converts the date column into a unified format. Adds the year column.

Parameters:

df (DataFrame, required): A dataframe of the data. It must contain columns identity and date.

Returns:

DataFrame: Modified dataframe of the data.

Source code in wildlife_datasets/splits/time_aware_split.py
def modify_df(self, df: pd.DataFrame) -> pd.DataFrame:
    """Prepares dataframe for splits.

    Removes identities specified in `self.identity_skip` (usually unknown identities).
    Converts the `date` column into a unified format.
    Adds the `year` column.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

    Returns:
        Modified dataframe of the data.
    """

    # Check if the DataFrame contains the column date.
    if 'date' not in df.columns:
        raise(Exception('Dataframe df does not contain column date.'))

    # Remove identities to be skipped
    df = df.copy()
    df = df[df['identity'] != self.identity_skip]

    # Removes entries without dates
    df = df[~df['date'].isnull()]

    # Convert date to datetime format (from possibly strings) and drop hours
    df['date'] = pd.to_datetime(df['date']).apply(lambda x: x.date())
    df['year'] = df['date'].apply(lambda x: x.year).to_numpy()            
    return df
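
The date normalization can be sketched in plain pandas; this mirrors the two conversion lines above on a toy dataframe:

import pandas as pd

df_toy = pd.DataFrame({
    'identity': ['a', 'a'],
    'date': ['2021-03-01', '2022-07-15 12:00:00'],
})
df_toy['date'] = pd.to_datetime(df_toy['date']).apply(lambda x: x.date())
df_toy['year'] = df_toy['date'].apply(lambda x: x.year).to_numpy()
# 'date' now holds datetime.date objects and 'year' holds [2021, 2022]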

resplit_random(df, idx_train, idx_test, year_max=np.inf)

Creates a random re-split of an already existing split.

The re-split mimics the original split: for each individual, the training set contains the same number of samples as before, and similarly for the testing set. The re-split samples may be drawn only from df['year'] <= year_max.

Parameters:

df (DataFrame, required): A dataframe of the data. It must contain columns identity and date.
idx_train (ndarray, required): Labels of the training set.
idx_test (ndarray, required): Labels of the testing set.
year_max (int, default inf): Considers only entries with df['year'] <= year_max.

Returns:

Tuple[ndarray, ndarray]: List of labels of the training and testing sets.

Source code in wildlife_datasets/splits/time_aware_split.py
def resplit_random(
        self,
        df: pd.DataFrame,
        idx_train: np.ndarray,
        idx_test: np.ndarray,
        year_max: int = np.inf
        ) -> Tuple[np.ndarray, np.ndarray]:
    """Creates a random re-split of an already existing split.

    The re-split mimics the original split: for each individual,
    the training set contains the same number of samples as before,
    and similarly for the testing set.
    The re-split samples may be drawn only from `df['year'] <= year_max`.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.
        idx_train (np.ndarray): Labels of the training set.
        idx_test (np.ndarray): Labels of the testing set.
        year_max (int, optional): Considers only entries with `df['year'] <= year_max`.

    Returns:
        List of labels of the training and testing sets.
    """

    df = self.modify_df(df)

    # Initialize the random number generator
    lcg = self.initialize_lcg()

    # Compute the number of samples for each individual in the training set
    counts_train = {}
    for name, df_name in df.loc[idx_train].groupby('identity'):
        counts_train[name] = len(df_name)
    # Compute the number of samples for each individual in the testing set
    counts_test = {}
    for name, df_name in df.loc[idx_test].groupby('identity'):
        counts_test[name] = len(df_name)

    idx_train_new = []
    idx_test_new = []
    # Loop over all individuals
    for name, df_name in df.groupby('identity'):
        # Extract the number of samples of this individual in the training and testing sets
        n_train = counts_train.get(name, 0)
        n_test = counts_test.get(name, 0)
        if n_train+n_test > 0:
            # Get randomly permuted indices of the corresponding identity
            df_name = df_name[df_name['year'] <= year_max]
            if len(df_name) < n_train+n_test:
                raise(Exception('The set is too small.'))
            # Get the correct number of indices in both sets
            idx_permutation = lcg.random_permutation(n_train+n_test)
            idx_permutation = np.array(idx_permutation)
            idx_train_new += list(df_name.index[idx_permutation[:n_train]])
            idx_test_new += list(df_name.index[idx_permutation[n_train:n_train+n_test]])
    return np.array(idx_train_new), np.array(idx_test_new)
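
A usage sketch building on an existing time-aware split; df is assumed to contain identity and date columns:

from wildlife_datasets import splits  # assuming the package's usual import

splitter = splits.TimeProportionSplit()
idx_train, idx_test = splitter.split(df)[0]
# Random re-split with the same per-individual counts, drawn only from years up to 2020
idx_train_new, idx_test_new = splitter.resplit_random(df, idx_train, idx_test, year_max=2020)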

Time-proportion split

Bases: TimeAwareSplit

Time-proportion non-random splitting method into training and testing sets.

For each individual, it extracts the unique observation dates and puts the first ratio fraction of them (one half by default) into the training set and the rest into the testing set. Individuals with only one observation date are put into the training set. Implementation of this paper (arXiv:2211.10307).

Source code in wildlife_datasets/splits/time_aware_split.py
class TimeProportionSplit(TimeAwareSplit):
    """Time-proportion non-random splitting method into training and testing sets.

    For each individual, it extracts the unique observation dates
    and puts the first `ratio` fraction of them (one half by default)
    into the training set and the rest into the testing set.
    Individuals with only one observation date are put into the training set.
    Implementation of [this paper](https://arxiv.org/abs/2211.10307).
    """

    def __init__(
            self,
            ratio: float = 0.5,
            seed: int = 666,
            identity_skip: str = 'unknown',
            ):
        """Initializes the class.

        Args:
            ratio (float, optional): The fraction of dates going to the training set.
            seed (int, optional): Initial seed for the LCG random generator.
            identity_skip (str, optional): Name of the identities to ignore.
        """

        self.ratio = ratio
        self.identity_skip = identity_skip
        self.seed = seed

    def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

        Returns:
            List of splits. Each split is a list of labels of the training and testing sets.
        """

        df = self.modify_df(df)
        idx_train = []
        idx_test = []
        # Loop over all identities; each group is a pair (identity, dataframe of that identity)
        for _, df_name in df.groupby('identity'):            
            dates = df_name.groupby('date')
            n_dates = len(dates)
            if n_dates > 1:
                # Loop over all dates; each group is a pair (date, dataframe of that date and identity)
                for i, (_, df_date) in enumerate(dates):
                    # Add the first ratio fraction of dates to the training set and the rest to the testing set
                    if i < int(np.round(self.ratio*n_dates)):
                        idx_train += list(df_date.index)
                    else:
                        idx_test += list(df_date.index)
            else:
                idx_train += list(df_name.index)
        return [(np.array(idx_train), np.array(idx_test))]

__init__(ratio=0.5, seed=666, identity_skip='unknown')

Initializes the class.

Parameters:

ratio (float, default 0.5): The fraction of dates going to the training set.
seed (int, default 666): Initial seed for the LCG random generator.
identity_skip (str, default 'unknown'): Name of the identities to ignore.
Source code in wildlife_datasets/splits/time_aware_split.py
def __init__(
        self,
        ratio: float = 0.5,
        seed: int = 666,
        identity_skip: str = 'unknown',
        ):
    """Initializes the class.

    Args:
        ratio (float, optional): The fraction of dates going to the training set.
        seed (int, optional): Initial seed for the LCG random generator.
        identity_skip (str, optional): Name of the identities to ignore.
    """

    self.ratio = ratio
    self.identity_skip = identity_skip
    self.seed = seed

split(df)

Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | `DataFrame` | A dataframe of the data. It must contain columns `identity` and `date`. | required |

Returns:

| Type | Description |
| --- | --- |
| `List[Tuple[ndarray, ndarray]]` | List of splits. Each split is a tuple with the labels of the training and testing sets. |

Source code in wildlife_datasets/splits/time_aware_split.py
def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

    Returns:
        List of splits. Each split is a tuple with the labels of the training and testing sets.
    """

    df = self.modify_df(df)
    idx_train = []
    idx_test = []
    # Loop over all identities; df_name is the sub-dataframe of a single identity
    for _, df_name in df.groupby('identity'):            
        dates = df_name.groupby('date')
        n_dates = len(dates)
        if n_dates > 1:
            # Loop over all dates; df_date is the sub-dataframe of a single date (and identity)
            for i, (_, df_date) in enumerate(dates):
                # Put the chronologically first `ratio` fraction of the dates into the training set, the rest into the testing set
                if i < int(np.round(self.ratio*n_dates)):
                    idx_train += list(df_date.index)
                else:
                    idx_test += list(df_date.index)
        else:
            idx_train += list(df_name.index)
    return [(np.array(idx_train), np.array(idx_test))]

Time-cutoff split

Bases: TimeAwareSplit

Time-cutoff non-random splitting method into training and testing sets.

Puts all samples observed before `year` into the training set and all samples observed during `year` into the testing set. Samples observed after `year` are ignored unless `test_one_year_only` is `False`, in which case they also go to the testing set. Implementation of [this paper](https://arxiv.org/abs/2211.10307).

Source code in wildlife_datasets/splits/time_aware_split.py
class TimeCutoffSplit(TimeAwareSplit):
    """Time-cutoff non-random splitting method into training and testing sets.

    Puts all samples observed before `year` into the training set.
    Puts all samples observed during `year` into the testing set.
    Ignores all samples observed after `year` unless `test_one_year_only` is `False`.
    Implementation of [this paper](https://arxiv.org/abs/2211.10307).
    """

    def __init__(
            self,
            year: int,
            test_one_year_only: bool = True,
            seed: int = 666,
            identity_skip: str = 'unknown',
            ) -> None:
        """Initializes the class.

        Args:
            year (int): Splitting year.
            test_one_year_only (bool, optional): Whether the test set is `df['year'] == year` or `df['year'] >= year`.
            seed (int, optional): Initial seed for the LCG random generator.            
            identity_skip (str, optional): Name of the identities to ignore.
        """

        self.year = year
        self.test_one_year_only = test_one_year_only
        self.identity_skip = identity_skip
        self.seed = seed

    def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

        Returns:
            List of splits. Each split is a tuple with the labels of the training and testing sets.
        """

        df = self.modify_df(df)
        idx_train = list(np.where(df['year'] < self.year)[0])
        if self.test_one_year_only:
            idx_test = list(np.where(df['year'] == self.year)[0])
        else:
            idx_test = list(np.where(df['year'] >= self.year)[0])
        return [(np.array(df.index.values)[idx_train], np.array(df.index.values)[idx_test])]
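
A minimal usage sketch, again with a made-up dataframe (assuming the class is exposed as `splits.TimeCutoffSplit`):

import pandas as pd
from wildlife_datasets import splits

df = pd.DataFrame({
    'identity': ['A', 'A', 'B', 'B'],
    'date': pd.to_datetime(['2019-06-01', '2020-06-01', '2019-07-01', '2021-07-01']),
})

# Train on samples before 2020, test on 2020 only; the 2021 sample is dropped.
splitter = splits.TimeCutoffSplit(year=2020, test_one_year_only=True)
idx_train, idx_test = splitter.split(df)[0]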

__init__(year, test_one_year_only=True, seed=666, identity_skip='unknown')

Initializes the class.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `year` | `int` | Splitting year. | required |
| `test_one_year_only` | `bool` | Whether the test set is `df['year'] == year` or `df['year'] >= year`. | `True` |
| `seed` | `int` | Initial seed for the LCG random generator. | `666` |
| `identity_skip` | `str` | Name of the identities to ignore. | `'unknown'` |
Source code in wildlife_datasets/splits/time_aware_split.py
def __init__(
        self,
        year: int,
        test_one_year_only: bool = True,
        seed: int = 666,
        identity_skip: str = 'unknown',
        ) -> None:
    """Initializes the class.

    Args:
        year (int): Splitting year.
        test_one_year_only (bool, optional): Whether the test set is `df['year'] == year` or `df['year'] >= year`.
        seed (int, optional): Initial seed for the LCG random generator.            
        identity_skip (str, optional): Name of the identities to ignore.
    """

    self.year = year
    self.test_one_year_only = test_one_year_only
    self.identity_skip = identity_skip
    self.seed = seed

split(df)

Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | `DataFrame` | A dataframe of the data. It must contain columns `identity` and `date`. | required |

Returns:

| Type | Description |
| --- | --- |
| `List[Tuple[ndarray, ndarray]]` | List of splits. Each split is a tuple with the labels of the training and testing sets. |

Source code in wildlife_datasets/splits/time_aware_split.py
def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

    Returns:
        List of splits. Each split is a tuple with the labels of the training and testing sets.
    """

    df = self.modify_df(df)
    idx_train = list(np.where(df['year'] < self.year)[0])
    if self.test_one_year_only:
        idx_test = list(np.where(df['year'] == self.year)[0])
    else:
        idx_test = list(np.where(df['year'] >= self.year)[0])
    return [(np.array(df.index.values)[idx_train], np.array(df.index.values)[idx_test])]

Lcg

Linear congruential generator for generating random numbers.

Copied from [StackOverflow](https://stackoverflow.com/questions/18634079/glibc-rand-function-implementation). It is machine-, distribution- and package-version-independent. It has some drawbacks (see the linked discussion) but is perfectly sufficient for our application.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `state` | `int` | Random state of the LCG. |
Source code in wildlife_datasets/splits/lcg.py
class Lcg():
    """Linear congruential generator for generating random numbers.

    Copied from [StackOverflow](https://stackoverflow.com/questions/18634079/glibc-rand-function-implementation).
    It is machine-, distribution- and package-version-independent.
    It has some drawbacks (check the link above) but is perfectly sufficient for our application.

    Attributes:
      state (int): Random state of the LCG.
    """

    def __init__(self, seed: int, iterate: int=0) -> None:
        """Initialization function for LCG.

        Args:
            seed (int): Initial random seed.
            iterate (int, optional): Number of initial random iterations.
        """
        self.state = seed
        for _ in range(iterate):
            self.random()

    def random(self) -> int:
        """Generates a new random integer from the current state.

        Returns:
            New random integer.
        """

        self.state = (self.state * 1103515245 + 12345) & 0x7FFFFFFF
        return self.state

    def random_permutation(self, n: int) -> np.ndarray:
        """Generates a random permutation of `range(n)`.

        Args:
            n (int): Length of the sequence to be permuted.

        Returns:
            Permuted sequence.
        """

        rnd = []
        for _ in range(n):
            self.random()
            rnd.append(self.state)
        return np.argsort(rnd)

    def random_shuffle(self, x: np.ndarray) -> np.ndarray:
        """Generates a random shuffle of `x`.

        Args:
            x (np.ndarray): Array to be permuted.

        Returns:
            Shuffled array.
        """

        return np.array(x)[self.random_permutation(len(x))]
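
A small sketch of the generator's determinism; we assume `Lcg` can be imported from `wildlife_datasets.splits` (it lives in `wildlife_datasets/splits/lcg.py`):

from wildlife_datasets.splits import Lcg

# Equal seeds yield equal sequences on any machine and package version.
lcg1, lcg2 = Lcg(seed=42), Lcg(seed=42)
assert [lcg1.random() for _ in range(5)] == [lcg2.random() for _ in range(5)]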

__init__(seed, iterate=0)

Initialization function for LCG.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `seed` | `int` | Initial random seed. | required |
| `iterate` | `int` | Number of initial random iterations. | `0` |
Source code in wildlife_datasets/splits/lcg.py
def __init__(self, seed: int, iterate: int=0) -> None:
    """Initialization function for LCG.

    Args:
        seed (int): Initial random seed.
        iterate (int, optional): Number of initial random iterations.
    """
    self.state = seed
    for _ in range(iterate):
        self.random()

random()

Generates a new random integer from the current state.

Returns:

| Type | Description |
| --- | --- |
| `int` | New random integer. |

Source code in wildlife_datasets/splits/lcg.py
def random(self) -> int:
    """Generates a new random integer from the current state.

    Returns:
        New random integer.
    """

    self.state = (self.state * 1103515245 + 12345) & 0x7FFFFFFF
    return self.state
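
The update above is the glibc-style linear congruential recurrence; since the state is a non-negative Python integer, masking with `0x7FFFFFFF` is the same as reducing modulo $2^{31}$:

$$
\mathrm{state}_{n+1} = \left(1103515245 \cdot \mathrm{state}_n + 12345\right) \bmod 2^{31}
$$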

random_permutation(n)

Generates a random permutation of range(n).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `n` | `int` | Length of the sequence to be permuted. | required |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Permuted sequence. |

Source code in wildlife_datasets/splits/lcg.py
def random_permutation(self, n: int) -> np.ndarray:
    """Generates a random permutation of `range(n)`.

    Args:
        n (int): Length of the sequence to be permuted.

    Returns:
        Permuted sequence.
    """

    rnd = []
    for _ in range(n):
        self.random()
        rnd.append(self.state)
    return np.argsort(rnd)
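
Why `argsort` yields a permutation: sorting the indices of the `n` generated states by value is a bijection of `range(n)`, so every index appears exactly once. A quick check (same `Lcg` import assumption as above):

from wildlife_datasets.splits import Lcg

perm = Lcg(seed=1).random_permutation(10)
assert sorted(perm) == list(range(10))  # a valid permutation of range(10)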

random_shuffle(x)

Generates a random shuffle of x.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `x` | `ndarray` | Array to be permuted. | required |

Returns:

| Type | Description |
| --- | --- |
| `ndarray` | Shuffled array. |

Source code in wildlife_datasets/splits/lcg.py
def random_shuffle(self, x: np.ndarray) -> np.ndarray:
    """Generates a random shuffle of `x`.

    Args:
        x (np.ndarray): Array to be permuted.

    Returns:
        Shuffled array.
    """

    return np.array(x)[self.random_permutation(len(x))]
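
A short sketch of `random_shuffle`; the array contents are arbitrary, the point is that a fixed seed reproduces the same shuffle:

import numpy as np
from wildlife_datasets.splits import Lcg

x = np.array(['a', 'b', 'c', 'd'])
shuffled = Lcg(seed=7).random_shuffle(x)
# The same seed gives the same shuffle again.
assert np.array_equal(shuffled, Lcg(seed=7).random_shuffle(x))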