Reference splitting functions

This file describes methods associated with dataset splitting.

Balanced split

Base class for splitting datasets into training and testing sets.

Implements methods from this paper. Its subclasses need to implement the split method. It should perform balanced splits separately for all classes. Its children are IdentitySplit and TimeAwareSplit. IdentitySplit has children ClosedSetSplit, OpenSetSplit and DisjointSetSplit. TimeAwareSplit has children TimeProportionSplit and TimeCutoffSplit.

Source code in wildlife_datasets/splits/balanced_split.py

class BalancedSplit():
    """Base class for splitting datasets into training and testing sets.

    Implements methods from [this paper](https://arxiv.org/abs/2211.10307).
    Its subclasses need to implement the `split` method.
    It should perform balanced splits separately for all classes.
    Its children are `IdentitySplit` and `TimeAwareSplit`.
    `IdentitySplit` has children `ClosedSetSplit`, `OpenSetSplit` and `DisjointSetSplit`.
    `TimeAwareSplit` has children `TimeProportionSplit` and `TimeCutoffSplit`.
    """

    def __init__(
            self,
            seed: int = 666,
            identity_skip: str = 'unknown',
            col_label: str = 'identity',
            disable_tqdm: bool = True,
            ) -> None:
        """Stores the configuration shared by all splitting methods.

        Args:
            seed (int, optional): Seed passed to the random number generator.
            identity_skip (str, optional): Label whose rows are removed by `modify_df`.
            col_label (str, optional): Name of the column holding the labels.
            disable_tqdm (bool, optional): Whether progress bars are disabled.
        """

        self.seed = seed
        self.identity_skip = identity_skip
        self.col_label = col_label
        self.disable_tqdm = disable_tqdm

    def modify_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepares dataframe for splits.

        Removes identities specified in `self.identity_skip` (usually unknown identities).
        The input dataframe is left untouched.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain the column `self.col_label`.

        Returns:
            Modified dataframe of the data (an independent copy).
        """

        # Filter first and copy afterwards so that discarded rows are never copied
        keep = df[self.col_label] != self.identity_skip
        return df[keep].copy()

    def initialize_lcg(self) -> Lcg:
        """Returns the random number generator.

        Returns:
            The random number generator.
        """

        return Lcg(self.seed)

    def split(self, *args, **kwargs) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Splitting method which needs to be implemented by subclasses.

        It splits the dataframe `df` into labels `idx_train` and `idx_test`.
        The subdataset is obtained by `df.loc[idx_train]` (not `iloc`).

        Returns:
            List of splits. Each split is list of labels of the training and testing sets.

        Raises:
            NotImplementedError: Always, the base class provides no splitting strategy.
        """

        raise NotImplementedError('Subclasses should implement this. \n You may want to use ClosedSetSplit instead of BalancedSplit.')

    def resplit_random(
            self,
            df: pd.DataFrame,
            idx_train: np.ndarray,
            idx_test: np.ndarray
            ) -> Tuple[np.ndarray, np.ndarray]:
        """Creates a random re-split of an already existing split.

        The re-split mimics the split as the training set contains
        the same number of samples for EACH individual.
        The same goes for the testing set.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain the column `self.col_label`.
            idx_train (np.ndarray): Labels of the training set.
            idx_test (np.ndarray): Labels of the testing set.

        Returns:
            List of labels of the training and testing sets.

        Raises:
            Exception: If an individual has fewer samples in `df` than in the split.
        """

        df = self.modify_df(df)

        # Initialize the random number generator
        lcg = self.initialize_lcg()

        # Compute the number of samples for each individual in the training set
        counts_train = {}
        for name, df_name in df.loc[idx_train].groupby(self.col_label):
            counts_train[name] = len(df_name)
        # Compute the number of samples for each individual in the testing set
        counts_test = {}
        for name, df_name in df.loc[idx_test].groupby(self.col_label):
            counts_test[name] = len(df_name)

        idx_train_new = []
        idx_test_new = []
        # Loop over all individuals
        for name, df_name in df.groupby(self.col_label):
            # Extract the number of samples in the training and testing sets
            n_train = counts_train.get(name, 0)
            n_test = counts_test.get(name, 0)
            if n_train+n_test > 0:
                if len(df_name) < n_train+n_test:
                    raise Exception('The set is too small.')
                # Get the correct number of indices in both sets
                idx_permutation = lcg.random_permutation(n_train+n_test)
                idx_permutation = np.array(idx_permutation)
                idx_train_new += list(df_name.index[idx_permutation[:n_train]])
                idx_test_new += list(df_name.index[idx_permutation[n_train:n_train+n_test]])
        return np.array(idx_train_new), np.array(idx_test_new)

    def resplit_by_features(
            self,
            df: pd.DataFrame,
            features: np.ndarray,
            idx_train: np.ndarray,
            save_clusters_prefix: Optional[str] = None,
            **kwargs,
            ) -> Tuple[np.ndarray, np.ndarray]:
        """Creates a random re-split of an already existing split.

        The re-split is based on similarity of features.
        It runs DBSCAN as described in `compute_clusters` and
        performs the re-split as described in `resplit_by_clusters`.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain the column `self.col_label`.
            features (np.ndarray): An array of features with the same length as `df`.
            idx_train (np.ndarray): Labels of the training set.
            save_clusters_prefix (Optional[str], optional): File name prefix for saving clusters.
                The clusters are written to `{save_clusters_prefix}.npy`.
            **kwargs (type, optional): See kwargs in `compute_clusters`.

        Returns:
            List of labels of the training and testing sets.
        """

        clusters = self.compute_clusters(df, features, **kwargs)
        if save_clusters_prefix is not None:
            np.save(f'{save_clusters_prefix}.npy', clusters)
        return self.resplit_by_clusters(df, clusters, idx_train)

    def compute_clusters(
            self,
            df: pd.DataFrame,
            features: np.ndarray,
            n_max_cluster: int = 5,
            eps_min: float = 0.01,
            eps_max: float = 0.50,
            eps_step: float = 0.01,
            min_samples: int = 2,
            ) -> np.ndarray:

        """Computes clusters for a random re-split of an already existing split.

        For each individual it runs DBSCAN with increasing eps (cluster radius)
        and keeps the last clustering whose largest cluster has at most
        `n_max_cluster` samples. Outliers are marked by NaN.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain the column `self.col_label`.
            features (np.ndarray): An array of features with the same length as `df`.
            n_max_cluster (int, optional): Maximal size of cluster before `eps` stops increasing.
            eps_min (float, optional): Lower bound for epsilon.
            eps_max (float, optional): Upper bound for epsilon.
            eps_step (float, optional): Step for epsilon.
            min_samples (int, optional): Minimal cluster size.

        Returns:
            Array of clusters with one entry per row kept by `modify_df`.
        """

        index_input = df.index
        df = self.modify_df(df)
        df['cluster'] = np.nan

        # `features` is aligned with the dataframe as passed in, so rows must be
        # located in the unfiltered index. Otherwise dropping the skipped identities
        # would shift the positions. Fall back to the filtered index when `features`
        # does not have the length of the input dataframe.
        if len(features) == len(index_input):
            index_features = index_input
        else:
            index_features = df.index

        for _, df_identity in tqdm(df.groupby(self.col_label), disable=self.disable_tqdm):
            f = features[index_features.get_indexer(df_identity.index)]
            # Run DBScan with increasing eps until there are no clusters bigger than n_max_cluster
            clusters_saved = None
            for eps in np.arange(eps_min, eps_max+eps_step, eps_step):
                clustering = DBSCAN(eps=eps, min_samples=min_samples)
                clustering.fit(f)
                clusters = pd.Series(clustering.labels_)
                clusters_counts = clusters.value_counts(sort=True)
                # Check if the largest cluster (without outliers) is not too big
                if clusters_counts.index[0] == -1:
                    clustering_failed = len(clusters_counts) > 1 and clusters_counts.iloc[1] > n_max_cluster
                else:
                    clustering_failed = len(clusters_counts) == 1 or clusters_counts.iloc[0] > n_max_cluster
                # If the largest cluster is not too big, save clustering and continue
                if not clustering_failed:
                    clusters_saved = clusters
                else:
                    break

            # Save the clusters
            if clusters_saved is not None:
                # DBSCAN labels are integers; convert to float before marking outliers by NaN
                labels = clusters_saved.to_numpy().astype(float)
                labels[labels == -1] = np.nan
                df.loc[df_identity.index, 'cluster'] = labels

        return df['cluster'].to_numpy()

    def resplit_by_clusters(
            self,
            df: pd.DataFrame,
            clusters: np.ndarray,
            idx_train: np.ndarray,
            ) -> Tuple[np.ndarray, np.ndarray]:

        """Creates a random re-split of an already existing split.

        The re-split is based on clusters which collect similar images.
        It first puts whole clusters of similar images into the training set.
        The rest is randomly split into training and testing sets.
        The re-split mimics the split as the training set contains
        the same number of samples for EACH individual.
        The same goes for the testing set.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain the column `self.col_label`.
            clusters (np.ndarray): An array of clusters as returned by `compute_clusters`.
                It is not modified.
            idx_train (np.ndarray): Labels of the training set.

        Returns:
            List of labels of the training and testing sets.
        """

        df = self.modify_df(df)

        # Work on a private copy so that the array of the caller is never modified
        clusters = np.array(clusters)
        if clusters.dtype.kind in 'biu':
            # Integer arrays cannot hold NaN
            clusters = clusters.astype(float)

        # Replace clusters appearing just once with np.nan
        clusters_unique, clusters_count = np.unique(clusters, return_counts=True)
        clusters[np.isin(clusters, clusters_unique[clusters_count == 1])] = np.nan
        df['cluster'] = clusters

        # Initialize the random number generator
        lcg = self.initialize_lcg()

        # Determine how many images of each individual should be in the training set
        identity_train_counts = df.loc[idx_train][self.col_label].value_counts()

        # Loop over individuals and create a split for each
        idx_train_new = []
        for identity, df_identity in tqdm(df.groupby(self.col_label), disable=self.disable_tqdm):
            n_train = identity_train_counts.get(identity, 0)
            if len(df_identity) - n_train <= 1:
                # All or all but one samples into the training set
                idx_remaining = np.array(df_identity.index)
                idx_remaining = lcg.random_shuffle(idx_remaining)
                idx_train_identity = idx_remaining[:n_train]
            else:
                # Add whole clusters into the training set (NaN clusters are skipped by groupby)
                idx_train_identity = []
                for _, df_cluster in df_identity.groupby('cluster'):
                    # Check if the training set is not too big
                    if len(idx_train_identity) + len(df_cluster) <= n_train:
                        idx_train_identity += list(df_cluster.index)

                # Distribute the remaining indices
                n_train_remaining = n_train - len(idx_train_identity)
                idx_remaining = self.setdiff(df_identity.index, idx_train_identity)
                idx_remaining = lcg.random_shuffle(idx_remaining)
                idx_train_identity += list(idx_remaining[:n_train_remaining])
            idx_train_new += list(idx_train_identity)
        idx_test_new = self.setdiff(df.index, idx_train_new)
        return np.array(idx_train_new), np.array(idx_test_new)

    def set_col_label(self, col_label: str) -> None:
        """Sets col_label to desired value.

        Args:
            col_label (str): Desired value for col_label.
        """

        self.col_label = col_label

    def setdiff(self, a, b):
        """Returns unique elements of `a` not present in `b`, in order of first appearance.

        Args:
            a (array-like): Values to be filtered.
            b (array-like): Values to be removed from `a`.

        Returns:
            Array of the remaining unique values.
        """

        a = np.array(a)
        b = np.array(b)
        # np.isin replaces np.in1d, which is deprecated in NumPy 2.0
        return pd.unique(a[~np.isin(a, b)])

`compute_clusters(df, features, n_max_cluster=5, eps_min=0.01, eps_max=0.5, eps_step=0.01, min_samples=2)`

Computes clusters for a random re-split of an already existing split.

It runs DBSCAN with increasing eps (cluster radius) until the clusters are smaller than n_max_cluster.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain column `identity`.	required
`features`	`ndarray`	An array of features with the same length as `df`.	required
`n_max_cluster`	`int`	Maximal size of cluster before `eps` stops increasing.	`5`
`eps_min`	`float`	Lower bound for epsilon.	`0.01`
`eps_max`	`float`	Upper bound for epsilon.	`0.5`
`eps_step`	`float`	Step for epsilon.	`0.01`
`min_samples`	`int`	Minimal cluster size.	`2`

Returns:

Type	Description
`ndarray`	List of clusters.

Source code in wildlife_datasets/splits/balanced_split.py

def compute_clusters(
        self,
        df: pd.DataFrame,
        features: np.ndarray,
        n_max_cluster: int = 5,
        eps_min: float = 0.01,
        eps_max: float = 0.50,
        eps_step: float = 0.01,
        min_samples: int = 2,
        ) -> np.ndarray:

    """Computes clusters for a random re-split of an already existing split.

    For each individual it runs DBSCAN with increasing eps (cluster radius)
    and keeps the last clustering whose largest cluster has at most
    `n_max_cluster` samples. Outliers are marked by NaN.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain the column `self.col_label`.
        features (np.ndarray): An array of features with the same length as `df`.
        n_max_cluster (int, optional): Maximal size of cluster before `eps` stops increasing.
        eps_min (float, optional): Lower bound for epsilon.
        eps_max (float, optional): Upper bound for epsilon.
        eps_step (float, optional): Step for epsilon.
        min_samples (int, optional): Minimal cluster size.

    Returns:
        Array of clusters with one entry per row kept by `modify_df`.
    """

    index_input = df.index
    df = self.modify_df(df)
    df['cluster'] = np.nan

    # `features` is aligned with the dataframe as passed in, so rows must be
    # located in the unfiltered index. Otherwise dropping the skipped identities
    # would shift the positions. Fall back to the filtered index when `features`
    # does not have the length of the input dataframe.
    if len(features) == len(index_input):
        index_features = index_input
    else:
        index_features = df.index

    for _, df_identity in tqdm(df.groupby(self.col_label), disable=self.disable_tqdm):
        f = features[index_features.get_indexer(df_identity.index)]
        # Run DBScan with increasing eps until there are no clusters bigger than n_max_cluster
        clusters_saved = None
        for eps in np.arange(eps_min, eps_max+eps_step, eps_step):
            clustering = DBSCAN(eps=eps, min_samples=min_samples)
            clustering.fit(f)
            clusters = pd.Series(clustering.labels_)
            clusters_counts = clusters.value_counts(sort=True)
            # Check if the largest cluster (without outliers) is not too big
            if clusters_counts.index[0] == -1:
                clustering_failed = len(clusters_counts) > 1 and clusters_counts.iloc[1] > n_max_cluster
            else:
                clustering_failed = len(clusters_counts) == 1 or clusters_counts.iloc[0] > n_max_cluster
            # If the largest cluster is not too big, save clustering and continue
            if not clustering_failed:
                clusters_saved = clusters
            else:
                break

        # Save the clusters
        if clusters_saved is not None:
            # DBSCAN labels are integers; convert to float before marking outliers by NaN
            labels = clusters_saved.to_numpy().astype(float)
            labels[labels == -1] = np.nan
            df.loc[df_identity.index, 'cluster'] = labels

    return df['cluster'].to_numpy()

`initialize_lcg()`

Returns the random number generator.

Returns:

Type	Description
`Lcg`	The random number generator.

Source code in wildlife_datasets/splits/balanced_split.py

def initialize_lcg(self) -> Lcg:
    """Creates the random number generator used by the splitting methods.

    Returns:
        A new `Lcg` instance constructed from `self.seed`.
    """

    generator = Lcg(self.seed)
    return generator

`modify_df(df)`

Prepares dataframe for splits.

Removes identities specified in self.identity_skip (usually unknown identities).

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain columns `identity` and `date`.	required

Returns:

Type	Description
`DataFrame`	Modified dataframe of the data.

Source code in wildlife_datasets/splits/balanced_split.py

def modify_df(self, df: pd.DataFrame) -> pd.DataFrame:
    """Prepares dataframe for splits.

    Removes identities specified in `self.identity_skip` (usually unknown identities).
    The input dataframe is left untouched.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain the column `self.col_label`.

    Returns:
        Modified dataframe of the data (an independent copy).
    """

    # Filter first and copy afterwards so that discarded rows are never copied
    keep = df[self.col_label] != self.identity_skip
    return df[keep].copy()

`resplit_by_clusters(df, clusters, idx_train)`

Creates a random re-split of an already existing split.

The re-split is based on clusters which collect similar images. It first puts whole clusters of similar images into the training set. The rest is randomly split into training and testing sets. The re-split mimics the original split: the training set contains the same number of samples for EACH individual. The same goes for the testing set.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain column `identity`.	required
`clusters`	`ndarray`	An array of clusters with the same length as `df`.	required
`idx_train`	`ndarray`	Labels of the training set.	required

Returns:

Type	Description
`Tuple[ndarray, ndarray]`	List of labels of the training and testing sets.

Source code in wildlife_datasets/splits/balanced_split.py

def resplit_by_clusters(
        self,
        df: pd.DataFrame,
        clusters: np.ndarray,
        idx_train: np.ndarray,
        ) -> Tuple[np.ndarray, np.ndarray]:

    """Creates a random re-split of an already existing split.

    The re-split is based on clusters which collect similar images.
    It first puts whole clusters of similar images into the training set.
    The rest is randomly split into training and testing sets.
    The re-split mimics the split as the training set contains
    the same number of samples for EACH individual.
    The same goes for the testing set.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain the column `self.col_label`.
        clusters (np.ndarray): An array of clusters as returned by `compute_clusters`.
            It is not modified.
        idx_train (np.ndarray): Labels of the training set.

    Returns:
        List of labels of the training and testing sets.
    """

    df = self.modify_df(df)

    # Work on a private copy so that the array of the caller is never modified
    clusters = np.array(clusters)
    if clusters.dtype.kind in 'biu':
        # Integer arrays cannot hold NaN
        clusters = clusters.astype(float)

    # Replace clusters appearing just once with np.nan
    clusters_unique, clusters_count = np.unique(clusters, return_counts=True)
    clusters[np.isin(clusters, clusters_unique[clusters_count == 1])] = np.nan
    df['cluster'] = clusters

    # Initialize the random number generator
    lcg = self.initialize_lcg()

    # Determine how many images of each individual should be in the training set
    identity_train_counts = df.loc[idx_train][self.col_label].value_counts()

    # Loop over individuals and create a split for each
    idx_train_new = []
    for identity, df_identity in tqdm(df.groupby(self.col_label), disable=self.disable_tqdm):
        n_train = identity_train_counts.get(identity, 0)
        if len(df_identity) - n_train <= 1:
            # All or all but one samples into the training set
            idx_remaining = np.array(df_identity.index)
            idx_remaining = lcg.random_shuffle(idx_remaining)
            idx_train_identity = idx_remaining[:n_train]
        else:
            # Add whole clusters into the training set (NaN clusters are skipped by groupby)
            idx_train_identity = []
            for _, df_cluster in df_identity.groupby('cluster'):
                # Check if the training set is not too big
                if len(idx_train_identity) + len(df_cluster) <= n_train:
                    idx_train_identity += list(df_cluster.index)

            # Distribute the remaining indices
            n_train_remaining = n_train - len(idx_train_identity)
            idx_remaining = self.setdiff(df_identity.index, idx_train_identity)
            idx_remaining = lcg.random_shuffle(idx_remaining)
            idx_train_identity += list(idx_remaining[:n_train_remaining])
        idx_train_new += list(idx_train_identity)
    idx_test_new = self.setdiff(df.index, idx_train_new)
    return np.array(idx_train_new), np.array(idx_test_new)

`resplit_by_features(df, features, idx_train, save_clusters_prefix=None, **kwargs)`

Creates a random re-split of an already existing split.

The re-split is based on similarity of features. It runs DBSCAN as described in compute_clusters and performs the clustering as described in resplit_by_clusters.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain column `identity`.	required
`features`	`ndarray`	An array of features with the same length as `df`.	required
`idx_train`	`ndarray`	Labels of the training set.	required
`save_clusters_prefix`	`Optional[str]`	File name prefix for saving clusters.	`None`
`**kwargs`	`type`	See kwargs in `compute_clusters`.	`{}`

Returns:

Type	Description
`Tuple[ndarray, ndarray]`	List of labels of the training and testing sets.

Source code in wildlife_datasets/splits/balanced_split.py

def resplit_by_features(
        self,
        df: pd.DataFrame,
        features: np.ndarray,
        idx_train: np.ndarray,
        save_clusters_prefix: Optional[str] = None,
        **kwargs,
        ) -> Tuple[np.ndarray, np.ndarray]:
    """Re-splits an existing split using feature similarity.

    The clusters are obtained from `compute_clusters` and then handed
    over to `resplit_by_clusters`, which produces the new split.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.
        features (np.ndarray): An array of features with the same length as `df`.
        idx_train (np.ndarray): Labels of the training set.
        save_clusters_prefix (Optional[str], optional): File name prefix for saving clusters.
            When given, the clusters are written to `{save_clusters_prefix}.npy`.
        **kwargs (type, optional): See kwargs in `compute_clusters`.

    Returns:
        List of labels of the training and testing sets.
    """

    cluster_labels = self.compute_clusters(df, features, **kwargs)

    # Optionally store the clusters before they are used for the re-split
    should_save = save_clusters_prefix is not None
    if should_save:
        target = f'{save_clusters_prefix}.npy'
        np.save(target, cluster_labels)

    new_split = self.resplit_by_clusters(df, cluster_labels, idx_train)
    return new_split

`resplit_random(df, idx_train, idx_test)`

Creates a random re-split of an already existing split.

The re-split mimics the split as the training set contains the same number of samples for EACH individual. The same goes for the testing set.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain columns `identity` and `date`.	required
`idx_train`	`ndarray`	Labels of the training set.	required
`idx_test`	`ndarray`	Labels of the testing set.	required

Returns:

Type	Description
`Tuple[ndarray, ndarray]`	List of labels of the training and testing sets.

Source code in wildlife_datasets/splits/balanced_split.py

def resplit_random(
        self,
        df: pd.DataFrame,
        idx_train: np.ndarray,
        idx_test: np.ndarray
        ) -> Tuple[np.ndarray, np.ndarray]:
    """Randomly re-draws an existing split while keeping its per-individual sizes.

    For every individual, the new training set holds as many samples as the
    given training set did, and likewise for the testing set.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.
        idx_train (np.ndarray): Labels of the training set.
        idx_test (np.ndarray): Labels of the testing set.

    Returns:
        List of labels of the training and testing sets.
    """

    df = self.modify_df(df)
    lcg = self.initialize_lcg()

    def count_per_individual(labels):
        # Number of samples of each individual among the given labels
        return {
            individual: len(rows)
            for individual, rows in df.loc[labels].groupby(self.col_label)
        }

    train_sizes = count_per_individual(idx_train)
    test_sizes = count_per_individual(idx_test)

    new_train = []
    new_test = []
    for individual, rows in df.groupby(self.col_label):
        n_train = train_sizes.get(individual, 0)
        n_test = test_sizes.get(individual, 0)
        n_total = n_train + n_test
        # Individuals absent from the original split are left out
        if n_total <= 0:
            continue
        if len(rows) < n_total:
            raise(Exception('The set is too small.'))
        # Permuted positions decide which samples go to which set
        order = np.array(lcg.random_permutation(n_total))
        new_train.extend(rows.index[order[:n_train]])
        new_test.extend(rows.index[order[n_train:n_total]])
    return np.array(new_train), np.array(new_test)

`set_col_label(col_label)`

Sets col_label to desired value

Parameters:

Name	Type	Description	Default
`col_label`	`str`	Desired value for col_label.	required

Source code in wildlife_datasets/splits/balanced_split.py

def set_col_label(self, col_label: str) -> None:
    """Stores a new name of the label column.

    Args:
        col_label (str): Value to be stored in `self.col_label`.
    """

    setattr(self, 'col_label', col_label)

`split(*args, **kwargs)`

Splitting method which needs to be implemented by subclasses.

It splits the dataframe df into labels idx_train and idx_test. The subdataset is obtained by df.loc[idx_train] (not iloc).

Returns:

Type	Description
`List[Tuple[ndarray, ndarray]]`	List of splits. Each split is list of labels of the training and testing sets.

Source code in wildlife_datasets/splits/balanced_split.py

def split(self, *args, **kwargs) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Placeholder for the splitting method provided by subclasses.

    A subclass splits the dataframe `df` into labels `idx_train` and `idx_test`.
    The subdataset is obtained by `df.loc[idx_train]` (not `iloc`).

    Returns:
        List of splits. Each split is list of labels of the training and testing sets.

    Raises:
        NotImplementedError: Always, as this version performs no splitting.
    """

    message = 'Subclasses should implement this. \n You may want to use ClosedSetSplit instead of BalancedSplit.'
    raise NotImplementedError(message)

Identity split

Bases: BalancedSplit

Base class for ClosedSetSplit, OpenSetSplit and DisjointSetSplit.

Source code in wildlife_datasets/splits/identity_split.py

class IdentitySplit(BalancedSplit):
    """Base class for `ClosedSetSplit`, `OpenSetSplit` and `DisjointSetSplit`.

    `general_split` reads `self.ratio_train`, which is not set in this class.
    """

    def general_split(
            self,
            df: pd.DataFrame,
            individual_train: List[str],
            individual_test: List[str],
            ) -> Tuple[np.ndarray, np.ndarray]:
        """General-purpose split into the training and testing sets.

        It puts all samples of `individual_train` into the training set
        and all samples of `individual_test` into the testing set.
        The splitting is performed for each remaining individual separately.
        The split will result in at least one sample in both the training and testing sets.
        If only one sample is available for an individual, it will be in the training set.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain the column `self.col_label`.
            individual_train (List[str]): Individuals to be only in the training set.
            individual_test (List[str]): Individuals to be only in the testing set.

        Returns:
            List of labels of the training and testing sets.

        Raises:
            Exception: If an individual is both in `individual_train` and `individual_test`.
            KeyError: If a listed individual does not appear in `df`.
        """

        # Initialize the random number generator
        lcg = self.initialize_lcg()

        # Compute how many samples go automatically to the training and testing sets
        y_counts = df[self.col_label].value_counts()
        n_train_fixed = sum([y_counts.loc[y] for y in individual_train])
        n_test_fixed = sum([y_counts.loc[y] for y in individual_test])

        # Recompute ratio_train for the remaining individuals and adjust it to proper bounds
        n = len(df)
        ratio_train = self.ratio_train
        if 0 < n_train_fixed + n_test_fixed < n:
            ratio_train = (n*ratio_train - n_train_fixed) / (n - n_test_fixed - n_train_fixed)
        ratio_train = np.clip(ratio_train, 0, 1)

        # Sets give constant-time membership tests inside the loop
        train_only = set(individual_train)
        test_only = set(individual_test)
        if train_only & test_only:
            raise Exception('Individual cannot be both in individual_train and individual_test.')

        idx_train = []
        idx_test = []
        # Make a loop over all individuals
        for individual, df_individual in df.groupby(self.col_label):
            if individual in train_only:
                # The individual goes to the training set with all its samples
                idx_train += list(df_individual.index)
            elif individual in test_only:
                # The individual goes to the testing set with all its samples
                idx_test += list(df_individual.index)
            else:
                # Otherwise compute the number of samples in the training set
                n_individual = len(df_individual)
                n_train_individual = np.round(ratio_train * n_individual).astype(int)
                # Keep at least one sample for the testing set when possible
                if n_train_individual == n_individual and n_train_individual > 1:
                    n_train_individual -= 1
                # Keep at least one sample for the training set
                if n_train_individual == 0:
                    n_train_individual = 1
                # Randomly permute the samples and cut them into both sets
                idx_permutation = lcg.random_permutation(n_individual)
                idx_permutation = np.array(idx_permutation)
                idx_train += list(df_individual.index[idx_permutation[:n_train_individual]])
                idx_test += list(df_individual.index[idx_permutation[n_train_individual:]])
        return np.array(idx_train), np.array(idx_test)

`general_split(df, individual_train, individual_test)`

General-purpose split into the training and testing sets.

It puts all samples of individual_train into the training set and all samples of individual_test into the testing set. The splitting is performed for each individual separately. The split will result in at least one sample in both the training and testing sets. If only one sample is available for an individual, it will be in the training set.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain column `identity`.	required
`individual_train`	`List[str]`	Individuals to be only in the training set.	required
`individual_test`	`List[str]`	Individuals to be only in the testing set.	required

Returns:

Type	Description
`Tuple[ndarray, ndarray]`	List of labels of the training and testing sets.

Source code in wildlife_datasets/splits/identity_split.py

def general_split(
        self,
        df: pd.DataFrame,
        individual_train: List[str],
        individual_test: List[str],
        ) -> Tuple[np.ndarray, np.ndarray]:
    """General-purpose split into the training and testing sets.

    It puts all samples of `individual_train` into the training set
    and all samples of `individual_test` into the testing set.
    The splitting is performed for each remaining individual separately.
    Such an individual gets at least one sample in both the training and testing sets.
    If only one sample is available for an individual, it will be in the training set.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain the label column `self.col_label`.
        individual_train (List[str]): Individuals to be only in the training set.
        individual_test (List[str]): Individuals to be only in the testing set.

    Returns:
        List of labels of the training and testing sets.

    Raises:
        Exception: If an individual present in `df` is listed both in
            `individual_train` and `individual_test`.
    """

    # Initialize the random number generator
    lcg = self.initialize_lcg()

    # Compute how many samples go automatically to the training and testing sets
    y_counts = df[self.col_label].value_counts()
    n_fixed_train = sum(y_counts.loc[y] for y in individual_train)
    n_fixed_test = sum(y_counts.loc[y] for y in individual_test)

    # Recompute ratio_train for the freely split individuals so that the overall
    # training ratio stays close to self.ratio_train, then clip it to [0, 1]
    n = len(df)
    ratio_train = self.ratio_train
    n_fixed = n_fixed_train + n_fixed_test
    if 0 < n_fixed < n:
        ratio_train = (n*ratio_train - n_fixed_train) / (n - n_fixed)
    ratio_train = np.clip(ratio_train, 0, 1)

    # Sets give constant-time membership tests; scanning the lists for every
    # individual would make the loop quadratic in the number of individuals
    train_only = set(individual_train)
    test_only = set(individual_test)

    idx_train = []
    idx_test = []
    # Make a loop over all individuals
    for individual, df_individual in df.groupby(self.col_label):
        in_train = individual in train_only
        in_test = individual in test_only
        if in_train and in_test:
            raise Exception('Individual cannot be both in individual_train and individual_test.')
        if in_train:
            # The individual goes fully to the training set
            idx_train += list(df_individual.index)
        elif in_test:
            # The individual goes fully to the testing set
            idx_test += list(df_individual.index)
        else:
            # Otherwise compute the number of samples in the training set
            n_individual = len(df_individual)
            n_individual_train = np.round(ratio_train * n_individual).astype(int)
            # Keep at least one sample for the testing set when possible
            if n_individual_train == n_individual and n_individual_train > 1:
                n_individual_train -= 1
            # Keep at least one sample for the training set
            if n_individual_train == 0:
                n_individual_train = 1
            # Randomly permute the samples and cut the permutation
            idx_permutation = np.array(lcg.random_permutation(n_individual))
            idx_train += list(df_individual.index[idx_permutation[:n_individual_train]])
            idx_test += list(df_individual.index[idx_permutation[n_individual_train:]])
    return np.array(idx_train), np.array(idx_test)

Closed-set split

Bases: IdentitySplit

Closed-set splitting method into training and testing sets.

All individuals are both in the training and testing set. The only exception is that individuals with only one sample are in the training set. Implementation of this paper.

Source code in wildlife_datasets/splits/identity_split.py

class ClosedSetSplit(IdentitySplit):
    """Closed-set split of a dataset into training and testing sets.

    Every individual appears in both sets.
    Individuals with a single sample are the exception: they are in the training set only.
    Implementation of [this paper](https://arxiv.org/abs/2211.10307).
    """

    def __init__(self, ratio_train: float, **kwargs) -> None:
        """Stores the training ratio and initializes the parent class.

        Args:
            ratio_train (float): *Approximate* fraction of samples in the training set.
            **kwargs (type, optional): See kwargs `seed`, `identity_skip` and `col_label` of the parent class.
        """

        self.ratio_train = ratio_train
        super().__init__(**kwargs)

    def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

        Returns:
            List with a single split. The split is a pair of labels of the training and testing sets.
        """

        prepared = self.modify_df(df)
        # No individual is forced into only one of the sets
        forced_train = np.array([], dtype=object)
        forced_test = np.array([], dtype=object)
        single_split = self.general_split(prepared, forced_train, forced_test)
        return [single_split]

`__init__(ratio_train, **kwargs)`

Initializes the class.

Parameters:

Name	Type	Description	Default
`ratio_train`	`float`	Approximate size of the training set.	required
`**kwargs`	`type`	See kwargs `seed`, `identity_skip` and `col_label` of the parent class.	`{}`

Source code in wildlife_datasets/splits/identity_split.py

def __init__(self, ratio_train: float, **kwargs) -> None:
    """Stores the training ratio and initializes the parent class.

    Args:
        ratio_train (float): *Approximate* fraction of samples in the training set.
        **kwargs (type, optional): See kwargs `seed`, `identity_skip` and `col_label` of the parent class.
    """

    self.ratio_train = ratio_train
    super().__init__(**kwargs)

`split(df)`

Implementation of the base splitting method.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain column `identity`.	required

Returns:

Type	Description
`List[Tuple[ndarray, ndarray]]`	List of splits. Each split is list of labels of the training and testing sets.

Source code in wildlife_datasets/splits/identity_split.py

def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

    Returns:
        List with a single split. The split is a pair of labels of the training and testing sets.
    """

    prepared = self.modify_df(df)
    # No individual is forced into only one of the sets
    forced_train = np.array([], dtype=object)
    forced_test = np.array([], dtype=object)
    single_split = self.general_split(prepared, forced_train, forced_test)
    return [single_split]

Open-set split

Bases: IdentitySplit

Open-set splitting method into training and testing sets.

Some individuals are in the testing but not in the training set. Implementation of this paper.

Source code in wildlife_datasets/splits/identity_split.py

class OpenSetSplit(IdentitySplit):
    """Open-set split of a dataset into training and testing sets.

    Some individuals are in the testing set but not in the training set
    (or the other way round when `open_in_test` is false).
    Implementation of [this paper](https://arxiv.org/abs/2211.10307).
    """

    def __init__(
            self,
            ratio_train: float,
            ratio_class_test: float = None,
            n_class_test: int = None,
            open_in_test: bool = True,
            **kwargs
            ) -> None:
        """Validates and stores the split parameters and initializes the parent class.

        Exactly one of `ratio_class_test` and `n_class_test` must be provided.
        `n_class_test` is the number of individuals placed in only one set.
        `ratio_class_test` is the ratio of samples (not of individuals)
        belonging to individuals placed in only one set.

        Args:
            ratio_train (float): *Approximate* fraction of samples in the training set.
            ratio_class_test (float, optional): *Approximate* ratio of samples of individuals only in the testing set.
            n_class_test (int, optional): Number of individuals only in the testing set.
            open_in_test (bool, optional): Whether the unique identities will be in the test (default) or train set.
            **kwargs (type, optional): See kwargs `seed`, `identity_skip` and `col_label` of the parent class.

        Raises:
            Exception: If none or both of `ratio_class_test` and `n_class_test` are provided.
        """

        has_ratio = ratio_class_test is not None
        has_count = n_class_test is not None
        if not has_ratio and not has_count:
            raise Exception('Either ratio_class_test or n_class_test must be provided.')
        if has_ratio and has_count:
            raise Exception('Only ratio_class_test or n_class_test can be provided.')

        self.ratio_train = ratio_train
        self.ratio_class_test = ratio_class_test
        self.n_class_test = n_class_test
        self.open_in_test = open_in_test
        super().__init__(**kwargs)

    def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

        Returns:
            List with a single split. The split is a pair of labels of the training and testing sets.
        """

        df = self.modify_df(df)
        lcg = self.initialize_lcg()

        # Count samples per individual and shuffle the individuals
        y_counts = df[self.col_label].value_counts()
        y_counts = y_counts.iloc[lcg.random_permutation(len(y_counts))]

        # Determine how many of the shuffled individuals are unique to one set
        if self.n_class_test is not None:
            n_class_test = self.n_class_test
        else:
            # Take individuals until they cover the requested number of samples
            n_test = np.round(len(df)*self.ratio_class_test).astype(int)
            n_class_test = np.where(np.cumsum(y_counts) >= n_test)[0][0] + 1

        # The selected individuals go purely to one set, nobody purely to the other
        unique_individuals = np.array(y_counts.index[:n_class_test])
        nobody = np.array([], dtype=object)
        if self.open_in_test:
            individual_train, individual_test = nobody, unique_individuals
        else:
            individual_train, individual_test = unique_individuals, nobody
        return [self.general_split(df, individual_train, individual_test)]

`__init__(ratio_train, ratio_class_test=None, n_class_test=None, open_in_test=True, **kwargs)`

Initializes the class.

The user must provide exactly one from ratio_class_test and n_class_test. The latter specifies the number of individuals to be only in the testing set. The former specifies the ratio of samples of individuals (not individuals themselves) to be only in the testing set.

Parameters:

Name	Type	Description	Default
`ratio_train`	`float`	Approximate size of the training set.	required
`ratio_class_test`	`float`	Approximate ratio of samples of individuals only in the testing set.	`None`
`n_class_test`	`int`	Number of individuals only in the testing set.	`None`
`open_in_test`	`bool`	Whether the unique identities will be in the test (default) or train set.	`True`
`**kwargs`	`type`	See kwargs `seed`, `identity_skip` and `col_label` of the parent class.	`{}`

Source code in wildlife_datasets/splits/identity_split.py

def __init__(
        self,
        ratio_train: float,
        ratio_class_test: float = None,
        n_class_test: int = None,
        open_in_test: bool = True,
        **kwargs
        ) -> None:
    """Validates and stores the split parameters and initializes the parent class.

    Exactly one of `ratio_class_test` and `n_class_test` must be provided.
    `n_class_test` is the number of individuals placed in only one set.
    `ratio_class_test` is the ratio of samples (not of individuals)
    belonging to individuals placed in only one set.

    Args:
        ratio_train (float): *Approximate* fraction of samples in the training set.
        ratio_class_test (float, optional): *Approximate* ratio of samples of individuals only in the testing set.
        n_class_test (int, optional): Number of individuals only in the testing set.
        open_in_test (bool, optional): Whether the unique identities will be in the test (default) or train set.
        **kwargs (type, optional): See kwargs `seed`, `identity_skip` and `col_label` of the parent class.

    Raises:
        Exception: If none or both of `ratio_class_test` and `n_class_test` are provided.
    """

    has_ratio = ratio_class_test is not None
    has_count = n_class_test is not None
    if not has_ratio and not has_count:
        raise Exception('Either ratio_class_test or n_class_test must be provided.')
    if has_ratio and has_count:
        raise Exception('Only ratio_class_test or n_class_test can be provided.')

    self.ratio_train = ratio_train
    self.ratio_class_test = ratio_class_test
    self.n_class_test = n_class_test
    self.open_in_test = open_in_test
    super().__init__(**kwargs)

`split(df)`

Implementation of the base splitting method.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain column `identity`.	required

Returns:

Type	Description
`List[Tuple[ndarray, ndarray]]`	List of splits. Each split is list of labels of the training and testing sets.

Source code in wildlife_datasets/splits/identity_split.py

def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

    Returns:
        List with a single split. The split is a pair of labels of the training and testing sets.
    """

    df = self.modify_df(df)
    lcg = self.initialize_lcg()

    # Count samples per individual and shuffle the individuals
    y_counts = df[self.col_label].value_counts()
    y_counts = y_counts.iloc[lcg.random_permutation(len(y_counts))]

    # Determine how many of the shuffled individuals are unique to one set
    if self.n_class_test is not None:
        n_class_test = self.n_class_test
    else:
        # Take individuals until they cover the requested number of samples
        n_test = np.round(len(df)*self.ratio_class_test).astype(int)
        n_class_test = np.where(np.cumsum(y_counts) >= n_test)[0][0] + 1

    # The selected individuals go purely to one set, nobody purely to the other
    unique_individuals = np.array(y_counts.index[:n_class_test])
    nobody = np.array([], dtype=object)
    if self.open_in_test:
        individual_train, individual_test = nobody, unique_individuals
    else:
        individual_train, individual_test = unique_individuals, nobody
    return [self.general_split(df, individual_train, individual_test)]

Disjoint-set split

Bases: IdentitySplit

Disjoint-set splitting method into training and testing sets.

No individuals are in both the training and testing sets. Implementation of this paper.

Source code in wildlife_datasets/splits/identity_split.py

class DisjointSetSplit(IdentitySplit):
    """Disjoint-set split of a dataset into training and testing sets.

    Each individual is either in the training or in the testing set, never in both.
    Implementation of [this paper](https://arxiv.org/abs/2211.10307).
    """

    def __init__(
            self,
            ratio_class_test: float = None,
            n_class_test: int = None,
            **kwargs
            ) -> None:
        """Validates and stores the split parameters and initializes the parent class.

        Exactly one of `ratio_class_test` and `n_class_test` must be provided.
        `n_class_test` is the number of individuals placed in the testing set.
        `ratio_class_test` is the ratio of samples (not of individuals)
        belonging to individuals placed in the testing set.

        Args:
            ratio_class_test (float, optional): *Approximate* ratio of samples of individuals only in the testing set.
            n_class_test (int, optional): Number of individuals only in the testing set.
            **kwargs (type, optional): See kwargs `seed`, `identity_skip` and `col_label` of the parent class.

        Raises:
            Exception: If none or both of `ratio_class_test` and `n_class_test` are provided.
        """

        has_ratio = ratio_class_test is not None
        has_count = n_class_test is not None
        if not has_ratio and not has_count:
            raise Exception('Either ratio_class_test or n_class_test must be provided.')
        if has_ratio and has_count:
            raise Exception('Only ratio_class_test or n_class_test can be provided.')

        # Arbitrary value: split() assigns every individual to exactly one set
        self.ratio_train = 0
        self.ratio_class_test = ratio_class_test
        self.n_class_test = n_class_test
        super().__init__(**kwargs)

    def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

        Returns:
            List with a single split. The split is a pair of labels of the training and testing sets.
        """

        df = self.modify_df(df)
        lcg = self.initialize_lcg()

        # Count samples per individual and shuffle the individuals
        y_counts = df[self.col_label].value_counts()
        y_counts = y_counts.iloc[lcg.random_permutation(len(y_counts))]

        # Determine how many of the shuffled individuals go to the testing set
        if self.n_class_test is not None:
            n_class_test = self.n_class_test
        else:
            # Take individuals until they cover the requested number of samples
            n_test = np.round(len(df)*self.ratio_class_test).astype(int)
            n_class_test = np.where(np.cumsum(y_counts) >= n_test)[0][0] + 1

        # The first individuals go purely to testing, all the others purely to training
        shuffled_individuals = y_counts.index
        individual_test = np.array(shuffled_individuals[:n_class_test])
        individual_train = np.array(shuffled_individuals[n_class_test:])
        return [self.general_split(df, individual_train, individual_test)]

`__init__(ratio_class_test=None, n_class_test=None, **kwargs)`

Initializes the class.

The user must provide exactly one from ratio_class_test and n_class_test. The latter specifies the number of individuals to be only in the testing set. The former specifies the ratio of samples of individuals (not individuals themselves) to be only in the testing set.

Parameters:

Name	Type	Description	Default
`ratio_class_test`	`float`	Approximate ratio of samples of individuals only in the testing set.	`None`
`n_class_test`	`int`	Number of individuals only in the testing set.	`None`
`**kwargs`	`type`	See kwargs `seed`, `identity_skip` and `col_label` of the parent class.	`{}`

Source code in wildlife_datasets/splits/identity_split.py

def __init__(
        self,
        ratio_class_test: float = None,
        n_class_test: int = None,
        **kwargs
        ) -> None:
    """Validates and stores the split parameters and initializes the parent class.

    Exactly one of `ratio_class_test` and `n_class_test` must be provided.
    `n_class_test` is the number of individuals placed in the testing set.
    `ratio_class_test` is the ratio of samples (not of individuals)
    belonging to individuals placed in the testing set.

    Args:
        ratio_class_test (float, optional): *Approximate* ratio of samples of individuals only in the testing set.
        n_class_test (int, optional): Number of individuals only in the testing set.
        **kwargs (type, optional): See kwargs `seed`, `identity_skip` and `col_label` of the parent class.

    Raises:
        Exception: If none or both of `ratio_class_test` and `n_class_test` are provided.
    """

    has_ratio = ratio_class_test is not None
    has_count = n_class_test is not None
    if not has_ratio and not has_count:
        raise Exception('Either ratio_class_test or n_class_test must be provided.')
    if has_ratio and has_count:
        raise Exception('Only ratio_class_test or n_class_test can be provided.')

    # Arbitrary value: split() assigns every individual to exactly one set
    self.ratio_train = 0
    self.ratio_class_test = ratio_class_test
    self.n_class_test = n_class_test
    super().__init__(**kwargs)

`split(df)`

Implementation of the base splitting method.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain column `identity`.	required

Returns:

Type	Description
`List[Tuple[ndarray, ndarray]]`	List of splits. Each split is list of labels of the training and testing sets.

Source code in wildlife_datasets/splits/identity_split.py

def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain column `identity`.

    Returns:
        List with a single split. The split is a pair of labels of the training and testing sets.
    """

    df = self.modify_df(df)
    lcg = self.initialize_lcg()

    # Count samples per individual and shuffle the individuals
    y_counts = df[self.col_label].value_counts()
    y_counts = y_counts.iloc[lcg.random_permutation(len(y_counts))]

    # Determine how many of the shuffled individuals go to the testing set
    if self.n_class_test is not None:
        n_class_test = self.n_class_test
    else:
        # Take individuals until they cover the requested number of samples
        n_test = np.round(len(df)*self.ratio_class_test).astype(int)
        n_class_test = np.where(np.cumsum(y_counts) >= n_test)[0][0] + 1

    # The first individuals go purely to testing, all the others purely to training
    shuffled_individuals = y_counts.index
    individual_test = np.array(shuffled_individuals[:n_class_test])
    individual_train = np.array(shuffled_individuals[n_class_test:])
    return [self.general_split(df, individual_train, individual_test)]

Time-aware split

Bases: BalancedSplit

Base class for TimeProportionSplit and TimeCutoffSplit.

Source code in wildlife_datasets/splits/time_aware_split.py

class TimeAwareSplit(BalancedSplit):
    """Base class for `TimeProportionSplit` and `TimeCutoffSplit`.
    """

    def modify_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepares dataframe for splits.

        Removes identities specified in `self.identity_skip` (usually unknown identities).
        Removes entries without dates.
        Converts the `date` column into a unified format (dates without time of day).
        Adds the `year` column.
        The input dataframe is not modified.

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

        Returns:
            Modified dataframe of the data.

        Raises:
            Exception: If `df` does not contain the column `date`.
        """

        # Check if the DataFrame contain the column date.
        if 'date' not in df.columns:
            raise Exception('Dataframe df does not contain column date.')

        # Drop identities to be skipped and entries without dates.
        # The copy is made AFTER filtering so that the assignments below write
        # to an independent dataframe; assigning to a filtered frame can
        # trigger pandas' SettingWithCopyWarning.
        keep = (df[self.col_label] != self.identity_skip) & ~df['date'].isnull()
        df = df[keep].copy()

        # Convert date to datetime format (from possibly strings) and drop hours
        df['date'] = pd.to_datetime(df['date']).apply(lambda x: x.date())
        df['year'] = df['date'].apply(lambda x: x.year).to_numpy()
        return df

`modify_df(df)`

Prepares dataframe for splits.

Removes identities specified in self.identity_skip (usually unknown identities). Convert the date column into a unified format. Add the year column.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain columns `identity` and `date`.	required

Returns:

Type	Description
`DataFrame`	Modified dataframe of the data.

Source code in wildlife_datasets/splits/time_aware_split.py

def modify_df(self, df: pd.DataFrame) -> pd.DataFrame:
    """Prepares dataframe for splits.

    Removes identities specified in `self.identity_skip` (usually unknown identities).
    Removes entries without dates.
    Converts the `date` column into a unified format (dates without time of day).
    Adds the `year` column.
    The input dataframe is not modified.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

    Returns:
        Modified dataframe of the data.

    Raises:
        Exception: If `df` does not contain the column `date`.
    """

    # Check if the DataFrame contain the column date.
    if 'date' not in df.columns:
        raise Exception('Dataframe df does not contain column date.')

    # Drop identities to be skipped and entries without dates.
    # The copy is made AFTER filtering so that the assignments below write
    # to an independent dataframe; assigning to a filtered frame can
    # trigger pandas' SettingWithCopyWarning.
    keep = (df[self.col_label] != self.identity_skip) & ~df['date'].isnull()
    df = df[keep].copy()

    # Convert date to datetime format (from possibly strings) and drop hours
    df['date'] = pd.to_datetime(df['date']).apply(lambda x: x.date())
    df['year'] = df['date'].apply(lambda x: x.year).to_numpy()
    return df

Time-proportion split

Bases: TimeAwareSplit

Time-proportion non-random splitting method into training and testing sets.

For each individual, it extracts unique observation dates and puts the earlier part of them (half by default) into the training set and the rest into the testing set. Individuals with only one observation date are put entirely into the training set. Implementation of this paper.

Source code in wildlife_datasets/splits/time_aware_split.py

class TimeProportionSplit(TimeAwareSplit):
    """Time-proportion non-random splitting method into training and testing sets.

    For each individual, it groups the samples by observation date.
    The first dates (half of them by default) go to the training set
    and the remaining dates to the testing set.
    Individuals with only one observation date go entirely to the training set.
    Implementation of [this paper](https://arxiv.org/abs/2211.10307).
    """

    def __init__(self, ratio: float = 0.5, **kwargs):
        """Stores the ratio of training dates and initializes the parent class.

        Args:
            ratio (float, optional): The fraction of dates going to the training set.
            **kwargs (type, optional): See kwargs `seed`, `identity_skip` and `col_label` of the parent class.
        """

        self.ratio = ratio
        super().__init__(**kwargs)

    def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

        Returns:
            List with a single split. The split is a pair of labels of the training and testing sets.
        """

        df = self.modify_df(df)
        idx_train = []
        idx_test = []
        # Process every individual separately
        for _, df_individual in df.groupby(self.col_label):
            by_date = df_individual.groupby('date')
            n_dates = len(by_date)
            if n_dates <= 1:
                # Nothing to split: everything goes to the training set
                idx_train.extend(df_individual.index)
                continue
            # Number of dates going to training; at least one date is left for testing
            n_dates_train = np.minimum(n_dates-1, int(np.round(self.ratio*n_dates)))
            for position, (_, df_date) in enumerate(by_date):
                target = idx_train if position < n_dates_train else idx_test
                target.extend(df_date.index)
        return [(np.array(idx_train), np.array(idx_test))]

`__init__(ratio=0.5, **kwargs)`

Initializes the class.

Parameters:

Name	Type	Description	Default
`ratio`	`float`	The fraction of dates going to the training set.	`0.5`
`**kwargs`	`type`	See kwargs `seed`, `identity_skip` and `col_label` of the parent class.	`{}`

Source code in wildlife_datasets/splits/time_aware_split.py

def __init__(self, ratio: float = 0.5, **kwargs):
    """Stores the ratio of training dates and initializes the parent class.

    Args:
        ratio (float, optional): The fraction of dates going to the training set.
        **kwargs (type, optional): See kwargs `seed`, `identity_skip` and `col_label` of the parent class.
    """

    self.ratio = ratio
    super().__init__(**kwargs)

`split(df)`

Implementation of the base splitting method.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain columns `identity` and `date`.	required

Returns:

Type	Description
`List[Tuple[ndarray, ndarray]]`	List of splits. Each split is list of labels of the training and testing sets.

Source code in wildlife_datasets/splits/time_aware_split.py

def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

    Returns:
        List with a single split. The split is a pair of labels of the training and testing sets.
    """

    df = self.modify_df(df)
    idx_train = []
    idx_test = []
    # Process every individual separately
    for _, df_individual in df.groupby(self.col_label):
        by_date = df_individual.groupby('date')
        n_dates = len(by_date)
        if n_dates <= 1:
            # Nothing to split: everything goes to the training set
            idx_train.extend(df_individual.index)
            continue
        # Number of dates going to training; at least one date is left for testing
        n_dates_train = np.minimum(n_dates-1, int(np.round(self.ratio*n_dates)))
        for position, (_, df_date) in enumerate(by_date):
            target = idx_train if position < n_dates_train else idx_test
            target.extend(df_date.index)
    return [(np.array(idx_train), np.array(idx_test))]

Time-cutoff split

Bases: TimeAwareSplit

Time-cutoff non-random splitting method into training and testing sets.

Puts all samples observed before year into the training set. Puts all samples observed during year into the testing set. Samples observed after year are ignored by default; with test_one_year_only=False they are put into the testing set as well. Implementation of this paper.

Source code in wildlife_datasets/splits/time_aware_split.py

class TimeCutoffSplit(TimeAwareSplit):
    """Time-cutoff non-random splitting method into training and testing sets.

    Samples observed before `year` go to the training set.
    Samples observed during `year` go to the testing set.
    Samples observed after `year` are ignored when `test_one_year_only` is true
    and go to the testing set otherwise.
    Implementation of [this paper](https://arxiv.org/abs/2211.10307).
    """

    def __init__(self, year: int, test_one_year_only: bool = True, **kwargs) -> None:
        """Stores the cutoff settings and initializes the parent class.

        Args:
            year (int): Splitting year.
            test_one_year_only (bool, optional): Whether the test set is `df['year'] == year` or `df['year'] >= year`.
            **kwargs (type, optional): See kwargs `seed`, `identity_skip` and `col_label` of the parent class.
        """

        self.year = year
        self.test_one_year_only = test_one_year_only
        super().__init__(**kwargs)

    def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

        Args:
            df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

        Returns:
            List with a single split. The split is a pair of labels of the training and testing sets.
        """

        df = self.modify_df(df)
        labels = np.array(df.index.values)
        years = df['year']
        # Positions (not labels) of the rows belonging to each set
        pos_train = np.where(years < self.year)[0]
        in_test = years == self.year if self.test_one_year_only else years >= self.year
        pos_test = np.where(in_test)[0]
        return [(labels[pos_train], labels[pos_test])]

`__init__(year, test_one_year_only=True, **kwargs)`

Initializes the class.

Parameters:

Name	Type	Description	Default
`year`	`int`	Splitting year.	required
`test_one_year_only`	`bool`	Whether the test set is `df['year'] == year` or `df['year'] >= year`.	`True`
`**kwargs`	`type`	See kwargs `seed`, `identity_skip` and `col_label` of the parent class.	`{}`

Source code in wildlife_datasets/splits/time_aware_split.py

def __init__(self, year: int, test_one_year_only: bool = True, **kwargs) -> None:
    """Stores the cutoff settings and initializes the parent class.

    Args:
        year (int): Splitting year.
        test_one_year_only (bool, optional): If `True`, the test set is `df['year'] == year`;
            if `False`, it is `df['year'] >= year`.
        **kwargs (type, optional): Passed unchanged to the parent class initializer.
            See its kwargs such as `seed`, `identity_skip` and `col_label`.
    """

    # Attributes are stored before the parent initializer runs.
    self.year = year
    self.test_one_year_only = test_one_year_only
    super().__init__(**kwargs)

`split(df)`

Implementation of the base splitting method.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	A dataframe of the data. It must contain columns `identity` and `date`.	required

Returns:

Type	Description
`List[Tuple[ndarray, ndarray]]`	List of labels of the training and testing sets.

Source code in wildlife_datasets/splits/time_aware_split.py

def split(self, df: pd.DataFrame) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Implementation of the [base splitting method](../reference_splits#splits.balanced_split.BalancedSplit.split).

    Rows whose `year` is strictly below `self.year` form the training set.
    The testing set holds rows whose `year` equals `self.year`, or is greater
    than or equal to it when `self.test_one_year_only` is `False`.

    Args:
        df (pd.DataFrame): A dataframe of the data. It must contain columns `identity` and `date`.

    Returns:
        Single-element list with a tuple of index labels of the training and testing sets.
    """

    df = self.modify_df(df)
    # NOTE(review): the `year` column is read after `modify_df`; presumably
    # `modify_df` derives it from `date` - confirm against the parent class.
    years = df['year']
    labels = np.array(df.index.values)

    # Positional indices of the rows observed before the cutoff year.
    train_pos = np.where(years < self.year)[0]
    in_test = years == self.year if self.test_one_year_only else years >= self.year
    test_pos = np.where(in_test)[0]

    return [(labels[train_pos], labels[test_pos])]

Lcg

Linear congruential generator for generating random numbers.

Copied from StackOverflow. It is machine-, distribution- and package version-independent. It has some drawbacks (check the link above) but is perfectly sufficient for our application.

Attributes:

Name	Type	Description
`state`	`int`	Random state of the LCG.

Source code in wildlife_datasets/splits/lcg.py

class Lcg():
    """Linear congruential generator producing a deterministic stream of integers.

    The recurrence is copied from
    [StackOverflow](https://stackoverflow.com/questions/18634079/glibc-rand-function-implementation).
    It is machine-, distribution- and package version-independent.
    It has some drawbacks (check the link above) but is perfectly sufficient for our application.

    Attributes:
      state (int): Random state of the LCG.
    """

    def __init__(self, seed: int, iterate: int = 0) -> None:
        """Seeds the generator and optionally advances it.

        Args:
            seed (int): Initial random seed. It is stored as the state unchanged.
            iterate (int, optional): Number of initial random iterations.
        """
        self.state = seed
        # Discard the first `iterate` draws.
        for _ in range(iterate):
            self.random()

    def random(self) -> int:
        """Advances the state by one step and returns it.

        Returns:
            New random integer, which is also stored in `self.state`.
        """

        multiplier, increment, mask = 1103515245, 12345, 0x7FFFFFFF
        # The mask keeps only the lowest 31 bits of the result.
        self.state = (self.state * multiplier + increment) & mask
        return self.state

    def random_permutation(self, n: int) -> np.ndarray:
        """Generates a random permutation of `range(n)`.

        Draws `n` integers from the generator and returns the order that sorts them.

        Args:
            n (int): Length of the sequence to be permuted.

        Returns:
            Permuted sequence.
        """

        draws = [self.random() for _ in range(n)]
        return np.argsort(draws)

    def random_shuffle(self, x: np.ndarray) -> np.ndarray:
        """Generates a random shuffle of `x`.

        Args:
            x (np.ndarray): Array to be permuted.

        Returns:
            Shuffled array.
        """

        values = np.array(x)
        order = self.random_permutation(len(x))
        return values[order]

`__init__(seed, iterate=0)`

Initialization function for LCG.

Parameters:

Name	Type	Description	Default
`seed`	`int`	Initial random seed.	required
`iterate`	`int`	Number of initial random iterations.	`0`

Source code in wildlife_datasets/splits/lcg.py

def __init__(self, seed: int, iterate: int = 0) -> None:
    """Seeds the generator and optionally advances it.

    Args:
        seed (int): Initial random seed. It is stored as the state unchanged.
        iterate (int, optional): Number of initial random iterations.
    """
    self.state = seed
    # Discard the first `iterate` draws.
    for _ in range(iterate):
        self.random()

`random()`

Generates a new random integer from the current state.

Returns:

Type	Description
`int`	New random integer.

Source code in wildlife_datasets/splits/lcg.py

def random(self) -> int:
    """Advances the state by one step and returns it.

    Returns:
        New random integer, which is also stored in `self.state`.
    """

    multiplier, increment, mask = 1103515245, 12345, 0x7FFFFFFF
    # The mask keeps only the lowest 31 bits of the result.
    self.state = (self.state * multiplier + increment) & mask
    return self.state

`random_permutation(n)`

Generates a random permutation of range(n).

Parameters:

Name	Type	Description	Default
`n`	`int`	Length of the sequence to be permuted.	required

Returns:

Type	Description
`ndarray`	Permuted sequence.

Source code in wildlife_datasets/splits/lcg.py

def random_permutation(self, n: int) -> np.ndarray:
    """Generates a random permutation of `range(n)`.

    Draws `n` integers from the generator and returns the order that sorts them.

    Args:
        n (int): Length of the sequence to be permuted.

    Returns:
        Permuted sequence.
    """

    draws = [self.random() for _ in range(n)]
    return np.argsort(draws)

`random_shuffle(x)`

Generates a random shuffle of x.

Parameters:

Name	Type	Description	Default
`x`	`ndarray`	Array to be permuted.	required

Returns:

Type	Description
`ndarray`	Shuffled array.

Source code in wildlife_datasets/splits/lcg.py

def random_shuffle(self, x: np.ndarray) -> np.ndarray:
    """Generates a random shuffle of `x`.

    Args:
        x (np.ndarray): Array to be permuted.

    Returns:
        Shuffled array.
    """

    values = np.array(x)
    order = self.random_permutation(len(x))
    return values[order]

Reference splitting functions

Balanced split

compute_clusters(df, features, n_max_cluster=5, eps_min=0.01, eps_max=0.5, eps_step=0.01, min_samples=2)

initialize_lcg()

modify_df(df)

resplit_by_clusters(df, clusters, idx_train)

resplit_by_features(df, features, idx_train, save_clusters_prefix=None, **kwargs)

resplit_random(df, idx_train, idx_test)

set_col_label(col_label)

split(*args, **kwargs)

Identity split

general_split(df, individual_train, individual_test)

Closed-set split

__init__(ratio_train, **kwargs)

split(df)

Open-set split

__init__(ratio_train, ratio_class_test=None, n_class_test=None, open_in_test=True, **kwargs)

split(df)

Disjoint-set split

__init__(ratio_class_test=None, n_class_test=None, **kwargs)

split(df)

Time-aware split

modify_df(df)

Time-proportion split

__init__(ratio=0.5, **kwargs)

split(df)

Time-cutoff split

__init__(year, test_one_year_only=True, **kwargs)

split(df)

Lcg

__init__(seed, iterate=0)

random()

random_permutation(n)

random_shuffle(x)

`compute_clusters(df, features, n_max_cluster=5, eps_min=0.01, eps_max=0.5, eps_step=0.01, min_samples=2)`

`initialize_lcg()`

`modify_df(df)`

`resplit_by_clusters(df, clusters, idx_train)`

`resplit_by_features(df, features, idx_train, save_clusters_prefix=None, **kwargs)`

`resplit_random(df, idx_train, idx_test)`

`set_col_label(col_label)`

`split(*args, **kwargs)`

`general_split(df, individual_train, individual_test)`

`__init__(ratio_train, **kwargs)`

`split(df)`

`__init__(ratio_train, ratio_class_test=None, n_class_test=None, open_in_test=True, **kwargs)`

`split(df)`

`__init__(ratio_class_test=None, n_class_test=None, **kwargs)`

`split(df)`

`modify_df(df)`

`__init__(ratio=0.5, **kwargs)`

`split(df)`

`__init__(year, test_one_year_only=True, **kwargs)`

`split(df)`

`__init__(seed, iterate=0)`

`random()`

`random_permutation(n)`

`random_shuffle(x)`