Skip to content

Datasets

The Dataset class enables loading and manipulating the datasets. The datasets are contained in the folder data and divided into two subfolders:

  • real → Real World Datasets
  • syn → Synthetic Datasets

Dataset dataclass

A class to represent a dataset.

Attributes:

Name Type Description
name str

The name of the dataset.

path str

The path to the dataset file.

feature_names_filepath Optional[str]

The path to the json file containing the feature names of the dataset.

X Optional[NDArray]

Data matrix of the dataset.

y Optional[NDArray]

The labels of the dataset.

X_train Optional[NDArray]

Training set, initialized to None

y_train Optional[NDArray]

The labels of the training set

X_test Optional[NDArray]

Test set, initialized to None

y_test Optional[NDArray]

The labels of the test set

feature_names Optional[List[str]]

The names of the features of the dataset.

Source code in utils_reboot/datasets.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
@dataclass
class Dataset:
    """
    A class to represent a dataset.

    Attributes:
        name: The name of the dataset.
        path: The path to the folder containing the dataset file.
        feature_names_filepath: The path to the json file containing the feature names of the dataset.
        X: Data matrix of the dataset.
        y: The labels of the dataset.
        X_train: Training set, initialized to None
        y_train: The labels of the training set
        X_test: Test set, initialized to None
        y_test: The labels of the test set
        feature_names: The names of the features of the dataset.
    """
    name: str
    path: str = "../data/"
    feature_names_filepath: Optional[str] = None
    X: Optional[npt.NDArray] = field(default=None, init=False)
    y: Optional[npt.NDArray] = field(default=None, init=False)
    X_train: Optional[npt.NDArray] = field(default=None, init=False)
    y_train: Optional[npt.NDArray] = field(default=None, init=False)
    X_test: Optional[npt.NDArray] = field(default=None, init=False)
    y_test: Optional[npt.NDArray] = field(default=None, init=False)
    feature_names: Optional[List[str]] = field(default=None, init=False)

    def __post_init__(self) -> None:
        """Initialize the dataset.

        Load the dataset from the file and set the feature names.
        """
        self.load()

        if self.feature_names_filepath is not None:
            self.dataset_feature_names()

        # Fall back to positional feature identifiers when no names are
        # available. NOTE(review): this stores an int ndarray rather than a
        # List[str] as the field annotation declares — confirm downstream
        # consumers accept either form.
        if self.feature_names is None:
            self.feature_names = np.arange(self.shape[1])

    @property
    def shape(self) -> tuple:
        """
        Return the shape of the dataset.

        Returns:
            The shape of the data matrix X, or an empty tuple if the dataset
            has not been loaded yet.
        """
        return self.X.shape if self.X is not None else ()

    @property
    def n_outliers(self) -> int:
        """
        Return the number of outliers in the dataset.

        Outliers are the samples labelled 1 in ``y``.

        Returns:
            The number of outliers, or 0 if the dataset has not been loaded.
        """
        return int(sum(self.y)) if self.y is not None else 0

    @property
    def perc_outliers(self) -> float:
        """
        Return the percentage of outliers in the dataset (i.e. the contamination factor)

        Returns:
            The fraction of samples labelled 1, or 0.0 if the dataset has not
            been loaded.
        """
        return sum(self.y) / len(self.y) if self.y is not None else 0.0

    def load(self) -> None:
        """
        Load the dataset from the file.

        Tries, in order: a MATLAB ``.mat`` file (falling back to mat73 for
        v7.3 files), a CSV with explicit 'X'/'y' columns, and finally a CSV
        whose label column is named 'Target'.

        Raises:
            Exception: If no loading strategy succeeds (the dataset name is
                not valid).

        Returns:
            The dataset is loaded in place into ``X`` and ``y``.
        """
        try:
            datapath = self.path + self.name + ".mat"
            try:
                mat = loadmat(datapath)
            except NotImplementedError:
                # scipy's loadmat cannot read MATLAB v7.3 files; mat73 can.
                mat = mat73.loadmat(datapath)

            self.X = mat['X'].astype(float)
            self.y = mat['y'].reshape(-1, 1).astype(float)
        except FileNotFoundError:
            try:
                datapath = self.path + self.name + ".csv"
                T = pd.read_csv(datapath)
                if 'Unnamed: 0' in T.columns:
                    T = T.drop(columns=['Unnamed: 0'])
                self.X = T['X'].to_numpy(dtype=float)
                self.y = T['y'].to_numpy(dtype=float).reshape(-1, 1)
            except Exception as e:
                try:
                    datapath = self.path + self.name + ".csv"
                    # The glass dataset ships without an index column.
                    if self.name == "glass":
                        T = pd.read_csv(datapath)
                    else:
                        T = pd.read_csv(datapath, index_col=0)
                    if 'Unnamed: 0' in T.columns:
                        T = T.drop(columns=['Unnamed: 0'])
                    self.X = T.loc[:, T.columns != "Target"].to_numpy(float)
                    self.y = T.loc[:, "Target"].to_numpy(float)
                except Exception:
                    # was a bare `except:` — narrow it so KeyboardInterrupt
                    # and SystemExit are not swallowed.
                    raise Exception("The dataset name is not valid") from e

    def __repr__(self) -> str:
        """Return a compact summary: [name][shape][number of outliers]."""
        return f"[{self.name}][{self.shape}][{self.n_outliers}]"

    def drop_duplicates(self) -> None:
        """
        Drop duplicate samples from the dataset.

        Two samples are duplicates only if both features and label match.

        Returns:
            The dataset is modified in place.
        """
        # Stack y as the last column so duplicates are judged on (X, y) pairs.
        S = np.c_[self.X, self.y]
        S = pd.DataFrame(S).drop_duplicates().to_numpy()
        self.X, self.y = S[:, :-1], S[:, -1]

    def downsample(self, max_samples: int = 2500) -> None:
        """
        Downsample the dataset to a maximum number of samples.

        Uses a stratified split so the inlier/outlier ratio is preserved.

        Args:
            max_samples: The maximum number of samples to keep in the dataset.

        Returns:
            The dataset is modified in place.
        """
        if len(self.X) > max_samples:
            print("downsampled to ", max_samples)
            sss = SSS(n_splits=1, test_size=1 - max_samples / len(self.X))
            # Keep the "train" side of the single stratified split.
            index = list(sss.split(self.X, self.y))[0][0]
            self.X, self.y = self.X[index, :], self.y[index]

    def partition_data(self, X: npt.NDArray, y: npt.NDArray) -> tuple:
        """
        Split the given samples into inliers (y == 0) and outliers (y == 1).

        Args:
            X: Data matrix to partition.
            y: Labels aligned with the rows of X.

        Returns:
            A tuple (inliers, outliers, y_inliers, y_outliers), or None if the
            dataset is not loaded or X/y cannot be indexed yet.
        """
        # Ensure that X and y are not None
        if self.X is None or self.y is None:
            print("Dataset not loaded.")
            return
        try:
            inliers = X[y == 0, :]
            outliers = X[y == 1, :]
            y_inliers = y[y == 0]
            y_outliers = y[y == 1]
        except TypeError:
            # Happens when X or y is still None (boolean indexing fails).
            print('X_train and y_train not loaded yet. Run split_dataset() first')
            return
        return inliers, outliers, y_inliers, y_outliers

    def print_dataset_resume(self) -> None:
        """
        Print a summary of the dataset.

        The summary includes the number of samples, the number of features,
        the number of inliers and outliers and some summary statistics of the
        features.

        Returns:
            The dataset summary is printed.
        """
        # Ensure that X and y are not None
        if self.X is None or self.y is None:
            print("Dataset not loaded.")
            return

        # Basic statistics
        num_samples = len(self.X)
        num_features = self.X.shape[1] if self.X is not None else 0
        num_inliers = np.sum(self.y == 0)
        num_outliers = np.sum(self.y == 1)
        balance_ratio = num_outliers / num_samples

        # Aggregate statistics for features in X
        mean_values = np.mean(self.X, axis=0)
        std_dev_values = np.std(self.X, axis=0)
        min_values = np.min(self.X, axis=0)
        max_values = np.max(self.X, axis=0)

        # Compact representation of statistics (collapsed across features)
        mean_val = np.mean(mean_values)
        std_dev_val = np.mean(std_dev_values)
        min_val = np.min(min_values)
        max_val = np.max(max_values)

        # Print the summary
        print(f"Dataset Summary for '{self.name}':")
        print(f" Total Samples: {num_samples}, Features: {num_features}")
        print(f" Inliers: {num_inliers}, Outliers: {num_outliers}, Balance Ratio: {balance_ratio:.2f}")
        print(f" Feature Stats - Mean: {mean_val:.2f}, Std Dev: {std_dev_val:.2f}, Min: {min_val}, Max: {max_val}")

    def split_dataset(self,
                      train_size: float = 0.8,
                      contamination: float = 0.1) -> None:
        """
        Split the dataset into training and test sets with a given train size
        and contamination factor.

        Args:
            train_size: The proportion of the dataset to include in the training set.
            contamination: The proportion of outliers in the training set.

        Returns:
            The dataset is split in place: ``X_train``/``y_train`` are built.
            NOTE(review): ``X_test``/``y_test`` are NOT populated here; the
            remaining samples are discarded and ``pre_process()`` later falls
            back to the full dataset as the test set — confirm this is
            intentional.
        """
        # Ensure that X and y are not None
        if self.X is None or self.y is None:
            print("Dataset not loaded.")
            return

        # Check if train_size is correct: we cannot fill a larger train set
        # with the requested contamination if there are too few inliers.
        if train_size > 1 - self.perc_outliers:
            print("Train size is too large. Setting it at 1-dataset.perc_outliers.")
            train_size = 1 - self.perc_outliers

        indexes_outliers = np.where(self.y == 1)[0].tolist()
        indexes_inliers = np.where(self.y == 0)[0].tolist()
        random.shuffle(indexes_outliers)
        random.shuffle(indexes_inliers)
        dim_train = int(len(self.X) * train_size)
        self.X_train = np.zeros((dim_train, self.X.shape[1]))
        self.y_train = np.zeros(dim_train)
        for i in range(dim_train):
            # Fill the first `contamination` fraction of the train set with
            # outliers (while any remain), then pad with inliers.
            if i < dim_train * contamination and len(indexes_outliers) > 0:
                index = indexes_outliers.pop()
            else:
                index = indexes_inliers.pop()
            self.X_train[i] = self.X[index]
            self.y_train[i] = self.y[index]

    def pre_process(self) -> None:
        """
        Normalize the data using `StandardScaler()` from `sklearn.preprocessing`.

        The scaler is fit on the training set and applied to both the training
        and the test set.

        Returns:
           The dataset is normalized in place.
        """
        # Ensure that X and y are not None
        if self.X is None or self.y is None:
            print("Dataset not loaded.")
            return
        if self.X_train is None:
            self.initialize_train_test()
        if self.X_test is None:
            self.initialize_test()

        scaler = StandardScaler()

        self.X_train = scaler.fit_transform(self.X_train)
        self.X_test = scaler.transform(self.X_test)

    def initialize_train_test(self) -> None:
        """
        Initialize the training and test sets with the original dataset.

        This method is used when `split_dataset()` has not been called before
        `pre_process()`.

        Returns:
            The training and test sets are initialized in place.
        """
        # Ensure that X and y are not None
        if self.X is None or self.y is None:
            print("Dataset not loaded.")
            return
        if self.X_train is None:
            self.initialize_train()
        if self.X_test is None:
            self.initialize_test()

    def initialize_test(self) -> None:
        """
        Initialize the test set with the original dataset.

        This method is used when `split_dataset()` has not been called before
        `pre_process()`.

        Returns:
            The test set is initialized in place.
        """
        # Deep copies so later scaling does not mutate X/y.
        self.X_test = copy.deepcopy(self.X)
        self.y_test = copy.deepcopy(self.y)

    def initialize_train(self) -> None:
        """
        Initialize the train set with the original dataset.

        This method is used when `split_dataset()` has not been called before
        `pre_process()`.

        Returns:
            The training set is initialized in place.
        """
        # Deep copies so later scaling does not mutate X/y.
        self.X_train = copy.deepcopy(self.X)
        self.y_train = copy.deepcopy(self.y)

    def dataset_feature_names(self) -> None:
        """
        Set the feature names for the datasets for which the feature names
        are available.

        Returns:
            Sets the ``feature_names`` attribute to a list of strings
            containing the feature names of the dataset, or None when the
            dataset is not listed in the json file.
        """
        with open(self.feature_names_filepath + 'data_feature_names.json', 'r') as f:
            data_feature_names = json.load(f)

        if self.name in data_feature_names:
            self.feature_names = data_feature_names[self.name]
        else:
            self.feature_names = None
n_outliers: int property

Return the number of outliers in the dataset.

Returns:

Type Description
int

The number of outliers in the dataset.

perc_outliers: float property

Return the percentage of outliers in the dataset (i.e. the contamination factor)

Returns:

Type Description
float

The percentage of outliers in the dataset.

shape: tuple property

Return the shape of the dataset.

Returns:

Type Description
tuple

The shape of the dataset.

__post_init__()

Initialize the dataset.

Load the dataset from the file and set the feature names.

Source code in utils_reboot/datasets.py
51
52
53
54
55
56
57
58
59
60
61
62
63
def __post_init__(self) -> None:
    """Initialize the dataset.

    Load the dataset from the file and set the feature names.

    """
    self.load()

    if self.feature_names_filepath is not None:
        self.dataset_feature_names()

    if self.feature_names is None:
        self.feature_names=np.arange(self.shape[1])

dataset_feature_names()

Set the feature names for the datasets for which the feature names are available

Returns:

Type Description
List[str]

Set the feature_names attributes to a list of string containing the feature names of the dataset.

Source code in utils_reboot/datasets.py
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
def dataset_feature_names(self) -> List[str]:

        """ 
        Set the feature names for the datasets for which the feature names are available 

        Returns:
            Set the feature_names attributes to a list of string containing the feature names of the dataset.
        """
        with open(self.feature_names_filepath+'data_feature_names.json','r') as f:
            data_feature_names=json.load(f)

        if self.name in data_feature_names:    
            self.feature_names=data_feature_names[self.name]
        else:
            self.feature_names=None 

downsample(max_samples=2500)

Downsample the dataset to a maximum number of samples.

Parameters:

Name Type Description Default
max_samples int

The maximum number of samples to keep in the dataset.

2500

Returns:

Type Description
None

The dataset is modified in place.

Source code in utils_reboot/datasets.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def downsample(self, max_samples: int = 2500) -> None:
    """
    Downsample the dataset to a maximum number of samples.

    Args:
        max_samples: The maximum number of samples to keep in the dataset.

    Returns:
        The dataset is modified in place.
    """
    if len(self.X) > max_samples:
        print("downsampled to ", max_samples)
        sss = SSS(n_splits=1, test_size=1 - max_samples / len(self.X))
        index = list(sss.split(self.X, self.y))[0][0]
        self.X, self.y = self.X[index, :], self.y[index]

drop_duplicates()

Drop duplicate samples from the dataset.

Returns:

Type Description
None

The dataset is modified in place.

Source code in utils_reboot/datasets.py
141
142
143
144
145
146
147
148
149
150
def drop_duplicates(self) -> None:
    """
    Drop duplicate samples from the dataset.

    Returns:
        The dataset is modified in place.
    """
    S = np.c_[self.X, self.y]
    S = pd.DataFrame(S).drop_duplicates().to_numpy()
    self.X, self.y = S[:, :-1], S[:, -1]

initialize_test()

Initialize the test set with the original dataset.

This method is used when split_dataset() has not been called before pre_process().

Returns:

Type Description
None

The test set is initialized in place.

Source code in utils_reboot/datasets.py
308
309
310
311
312
313
314
315
316
317
318
319
320
def initialize_test(self) ->None:

    """
    Initialize the test set with the original dataset. 

    This method is used when `split_dataset()` has not been called before `pre_process()`.

    Returns:
        The test set is initialized in place.
    """

    self.X_test=copy.deepcopy(self.X)
    self.y_test=copy.deepcopy(self.y)

initialize_train()

Initialize the train set with the original dataset.

This method is used when split_dataset() has not been called before pre_process().

Returns:

Type Description
None

The training set is initialized in place.

Source code in utils_reboot/datasets.py
323
324
325
326
327
328
329
330
331
332
333
334
335
def initialize_train(self) ->None:

    """
    Initialize the train set with the original dataset. 

    This method is used when `split_dataset()` has not been called before `pre_process()`.

    Returns:
        The training set is initialized in place.
    """

    self.X_train=copy.deepcopy(self.X)
    self.y_train=copy.deepcopy(self.y)

initialize_train_test()

Initialize the training and test sets with the original dataset.

This method is used when split_dataset() has not been called before pre_process().

Returns:

Type Description
None

The training and test sets are initialized in place.

Source code in utils_reboot/datasets.py
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
def initialize_train_test(self) -> None:

    """
    Initialize the training and test sets with the original dataset. 

    This method is used when `split_dataset()` has not been called before `pre_process()`.

    Returns:
        The training and test sets are initialized in place.
    """
    # Ensure that X and y are not None
    if self.X is None or self.y is None:
        print("Dataset not loaded.")
        return
    if self.X_train is None:
        self.initialize_train()
    if self.X_test is None:
        self.initialize_test()

load()

Load the dataset from the file.

Raises:

Type Description
FileNotFoundError

If the dataset file is not found.

Exception

If the dataset name is not valid.

Returns:

Type Description
None

The dataset is loaded in place.

Source code in utils_reboot/datasets.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def load(self) -> None:
    """
    Load the dataset from the file.

    Raises:
        FileNotFoundError: If the dataset file is not found.
        Exception: If the dataset name is not valid.

    Returns:
        The dataset is loaded in place.
    """
    try:
        datapath = self.path + self.name + ".mat"
        try:
            mat = loadmat(datapath)
        except NotImplementedError:
            mat = mat73.loadmat(datapath)

        self.X = mat['X'].astype(float)
        self.y = mat['y'].reshape(-1, 1).astype(float)
    except FileNotFoundError:
        try:
            datapath = self.path + self.name + ".csv"
            T = pd.read_csv(datapath)
            if 'Unnamed: 0' in T.columns:
                T = T.drop(columns=['Unnamed: 0'])
            self.X = T['X'].to_numpy(dtype=float)
            self.y = T['y'].to_numpy(dtype=float).reshape(-1, 1)
        except Exception as e:
            try:
                datapath = self.path + self.name + ".csv"
                if self.name == "glass":
                    T = pd.read_csv(datapath)
                else:
                    T = pd.read_csv(datapath,index_col=0)
                if 'Unnamed: 0' in T.columns:
                    T = T.drop(columns=['Unnamed: 0'])
                self.X = T.loc[:,T.columns != "Target"].to_numpy(float)
                self.y = T.loc[:,"Target"].to_numpy(float)
            except:
                raise Exception("The dataset name is not valid") from e

pre_process()

Normalize the data using StandardScaler() from sklearn.preprocessing.

Returns:

Type Description
None

The dataset is normalized in place.

Source code in utils_reboot/datasets.py
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
def pre_process(self) -> None:

    """
    Normalize the data using `StandardScaler()` from `sklearn.preprocessing`.

    Returns:
       The dataset is normalized in place.
    """

    # Ensure that X and y are not None
    if self.X is None or self.y is None:
        print("Dataset not loaded.")
        return
    if self.X_train is None:
        self.initialize_train_test()
    if self.X_test is None:
        self.initialize_test()

    scaler = StandardScaler()

    self.X_train=scaler.fit_transform(self.X_train)
    self.X_test=scaler.transform(self.X_test)

print_dataset_resume()

Print a summary of the dataset.

The summary includes the number of samples, the number of features, the number of inliers and outliers and some summary statistics of the features.

Returns:

Type Description
None

The dataset summary is printed.

Source code in utils_reboot/datasets.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
def print_dataset_resume(self) -> None:
    """
    Print a summary of the dataset.

    The summary includes the number of samples, the number of features, the number of inliers and outliers and some
    summary statistics of the features.

    Returns:
        The dataset summary is printed.

    """
    # Ensure that X and y are not None
    if self.X is None or self.y is None:
        print("Dataset not loaded.")
        return

    # Basic statistics
    num_samples = len(self.X)
    num_features = self.X.shape[1] if self.X is not None else 0
    num_inliers = np.sum(self.y == 0)
    num_outliers = np.sum(self.y == 1)
    balance_ratio = num_outliers / num_samples

    # Aggregate statistics for features in X
    mean_values = np.mean(self.X, axis=0)
    std_dev_values = np.std(self.X, axis=0)
    min_values = np.min(self.X, axis=0)
    max_values = np.max(self.X, axis=0)

    # Compact representation of statistics
    mean_val = np.mean(mean_values)
    std_dev_val = np.mean(std_dev_values)
    min_val = np.min(min_values)
    max_val = np.max(max_values)

    # Print the summary
    print(f"Dataset Summary for '{self.name}':")
    print(f" Total Samples: {num_samples}, Features: {num_features}")
    print(f" Inliers: {num_inliers}, Outliers: {num_outliers}, Balance Ratio: {balance_ratio:.2f}")
    print(f" Feature Stats - Mean: {mean_val:.2f}, Std Dev: {std_dev_val:.2f}, Min: {min_val}, Max: {max_val}")

split_dataset(train_size=0.8, contamination=0.1)

Split the dataset into training and test sets with a given train size and contamination factor.

Parameters:

Name Type Description Default
train_size float

The proportion of the dataset to include in the training set.

0.8
contamination float

The proportion of outliers in the dataset.

0.1

Returns:

Type Description
None

The dataset is split into training and test sets in place

Source code in utils_reboot/datasets.py
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
def split_dataset(self, 
                  train_size:float = 0.8, 
                  contamination:float = 0.1) -> None:

    """
    Split the dataset into training and test sets with a given train size and contamination factor.

    Args:
        train_size: The proportion of the dataset to include in the training set.
        contamination: The proportion of outliers in the dataset.

    Returns:
        The dataset is split into training and test sets in place

    """
    # Ensure that X and y are not None
    if self.X is None or self.y is None:
        print("Dataset not loaded.")
        return

    # Check if train_size is correct
    if train_size > 1 - self.perc_outliers:
        print("Train size is too large. Setting it at 1-dataset.perc_outliers.")
        train_size = 1 - self.perc_outliers

    indexes_outliers = np.where(self.y==1)[0].tolist()
    indexes_inliers = np.where(self.y==0)[0].tolist()
    random.shuffle(indexes_outliers)
    random.shuffle(indexes_inliers)
    dim_train = int(len(self.X)*train_size)
    self.X_train = np.zeros((dim_train,self.X.shape[1]))
    self.y_train = np.zeros(dim_train)
    for i in range(dim_train):
        if i < dim_train*contamination and len(indexes_outliers) > 0:
            index = indexes_outliers.pop()
        else:
            index = indexes_inliers.pop()
        self.X_train[i] = self.X[index]
        self.y_train[i] = self.y[index]