Models

This section provides an overview of the implementation of IF (class IsolationForest), EIF,EIF+ and EXIFFI (all implemebted in class ExtendedIsolationForest). In order to achieve a speed up in the computations the numba Pyhton compiler is used.

`ExtendedIsolationForest`

Class that represents the Extended Isolation Forest model.

Attributes:

Name	Type	Description
`n_estimators`	`int`	Number of trees in the model. Defaults to 400
`max_samples`	`int`	Maximum number of samples in a node. Defaults to 256
`max_depth`	`int`	Maximum depth of the trees. Defaults to "auto"
`plus`	`bool`	Boolean flag to indicate if the model is a `EIF` or `EIF+`.
`name`	`str`	Name of the model
`ids`	`array`	Leaf node ids for each data point in the dataset. Defaults to None
`X`	`array`	Input dataset. Defaults to None
`eta`	`float`	Eta value for the model. Defaults to 1.5
`avg_number_of_nodes`	`int`	Average number of nodes in the trees

Source code in model_reboot/EIF_reboot.py

class ExtendedIsolationForest():

    """
    Class that represents the Extended Isolation Forest model.

    Attributes:
        n_estimators (int): Number of trees in the model. Defaults to 400
        max_samples (int): Maximum number of samples in a node. Defaults to 256
        max_depth (int): Maximum depth of the trees. Defaults to "auto"
        plus (bool): Boolean flag to indicate if the model is a `EIF` or `EIF+`.
        name (str): Name of the model
        ids (np.array): Leaf node ids for each data point in the dataset. Defaults to None
        X (np.array): Input dataset. Defaults to None
        eta (float): Eta value for the model. Defaults to 1.5
        avg_number_of_nodes (int): Average number of nodes in the trees

    """

    def __init__(self,
                 plus:bool,
                 n_estimators:int=400,
                 max_depth:Union[str,int]="auto",
                 max_samples:Union[str,int]="auto",
                 eta:float = 1.5):
        self.n_estimators = n_estimators
        self.max_samples = 256 if max_samples == "auto" else max_samples
        self.max_depth = max_depth
        self.plus=plus
        self.name="EIF"+"+"*int(plus)
        self.ids=None
        self.X=None
        self.eta=eta

    @property
    def avg_number_of_nodes(self) -> float:
        """
        Compute the average number of nodes in the trees.

        Returns:
            The average number of nodes in the trees.

        """
        return np.mean([T.node_count for T in self.trees])

    def fit(self, X:np.array, locked_dims:int=None) -> None:

        """
        Fit the model to the dataset.

        Args:
            X: Input dataset
            locked_dims: Number of dimensions to be locked in the model. Defaults to None

        Returns:
            The method fits the model and does not return any value.
        """

        self.ids = None
        if not locked_dims:
            locked_dims = 0

        if self.max_depth == "auto":
            self.max_depth = int(np.ceil(np.log2(self.max_samples)))
        subsample_size = np.min((self.max_samples, len(X)))
        self.trees = [ExtendedTree(subsample_size, X.shape[1], self.max_depth, locked_dims=locked_dims, plus=self.plus, eta=self.eta)
                      for _ in range(self.n_estimators)]
        for T in self.trees:
            T.fit(X[np.random.randint(len(X), size=subsample_size)])

    def compute_ids(self, X:np.array) -> None:

        """
        Compute the leaf node ids for each data point in the dataset.

        Args:
            X: Input dataset

        Returns:
            The method computes the leaf node ids and does not return any value.
        """
        if self.ids is None or self.X.shape != X.shape:
            self.X = X
            self.ids = np.array([tree.leaf_ids(X) for tree in self.trees])

    def predict(self, X:np.array) -> np.array:

        """
        Predict the anomaly score for each data point in the dataset.

        Args:
            X: Input dataset

        Returns:
            Anomaly score for each data point in the dataset.
        """
        self.compute_ids(X)
        #predictions=[tree.predict(X,self.ids[i]) for i,tree in enumerate(self.trees)]
        predictions=[tree.predict(self.ids[i]) for i,tree in enumerate(self.trees)]
        values = np.array([p[0] for p in predictions])
        return np.power(2,-np.mean([value for value in values], axis=0))

    def _predict(self,
                 X:np.array,
                 p:float) -> np.array:
        """
        Predict the class of each data point (i.e. inlier or outlier) based on the anomaly score.

        Args:
            X: Input dataset
            p: Proportion of outliers (i.e. threshold for the anomaly score)

        Returns:
           Class labels (i.e. 0 for inliers and 1 for outliers)
        """
        An_score = self.predict(X)
        y_hat = An_score > sorted(An_score,reverse=True)[int(p*len(An_score))]
        return y_hat

    def _importances(self,
                     X:np.array,
                     ids:np.array) -> tuple[np.array,np.array]:

        """
        Compute the importances of the features for the given leaf node ids.

        Args:
            X: Input dataset
            ids: Leaf node ids for each data point in the dataset.

        Returns:
            Importances of the features for the given leaf node ids and the normal vectors.

        """
        importances = np.zeros(X.shape)
        normals = np.zeros(X.shape)
        for i,T in enumerate(self.trees):
            importance, normal = T.importances(ids[i])
            importances += importance
            normals += normal
        return importances/self.n_estimators, normals/self.n_estimators

    def global_importances(self,
                           X:np.array,
                           p:float=0.1) -> np.array:

        """
        Compute the global importances of the features for the dataset.

        Args:
            X: Input dataset
            p: Proportion of outliers (i.e. threshold for the anomaly score). Defaults to 0.1

        Returns:
            Global importances of the features for the dataset.
        """

        self.compute_ids(X)
        y_hat = self._predict(X,p)
        importances, normals = self._importances(X, self.ids)
        outliers_importances,outliers_normals = np.sum(importances[y_hat],axis=0),np.sum(normals[y_hat],axis=0) 
        inliers_importances,inliers_normals = np.sum(importances[~y_hat],axis=0),np.sum(normals[~y_hat],axis=0)
        return (outliers_importances/outliers_normals)/(inliers_importances/inliers_normals)

    def local_importances(self,
                          X:np.array) -> np.array:

        """
        Compute the local importances of the features for the dataset.

        Args:
            X: Input dataset

        Returns:
           Local importances of the features for the dataset.
        """

        self.compute_ids(X)
        importances, normals = self._importances(X, self.ids)
        return importances/normals

`avg_number_of_nodes: float` `property`

Compute the average number of nodes in the trees.

Returns:

Type	Description
`float`	The average number of nodes in the trees.

`compute_ids(X)`

Compute the leaf node ids for each data point in the dataset.

Parameters:

Name	Type	Description	Default
`X`	`array`	Input dataset	required

Returns:

Type	Description
`None`	The method computes the leaf node ids and does not return any value.

Source code in model_reboot/EIF_reboot.py

def compute_ids(self, X:np.array) -> None:

    """
    Compute the leaf node ids for each data point in the dataset.

    Args:
        X: Input dataset

    Returns:
        The method computes the leaf node ids and does not return any value.
    """
    if self.ids is None or self.X.shape != X.shape:
        self.X = X
        self.ids = np.array([tree.leaf_ids(X) for tree in self.trees])

`fit(X, locked_dims=None)`

Fit the model to the dataset.

Parameters:

Name	Type	Description	Default
`X`	`array`	Input dataset	required
`locked_dims`	`int`	Number of dimensions to be locked in the model. Defaults to None	`None`

Returns:

Type	Description
`None`	The method fits the model and does not return any value.

Source code in model_reboot/EIF_reboot.py

def fit(self, X:np.array, locked_dims:int=None) -> None:

    """
    Fit the model to the dataset.

    Args:
        X: Input dataset
        locked_dims: Number of dimensions to be locked in the model. Defaults to None

    Returns:
        The method fits the model and does not return any value.
    """

    self.ids = None
    if not locked_dims:
        locked_dims = 0

    if self.max_depth == "auto":
        self.max_depth = int(np.ceil(np.log2(self.max_samples)))
    subsample_size = np.min((self.max_samples, len(X)))
    self.trees = [ExtendedTree(subsample_size, X.shape[1], self.max_depth, locked_dims=locked_dims, plus=self.plus, eta=self.eta)
                  for _ in range(self.n_estimators)]
    for T in self.trees:
        T.fit(X[np.random.randint(len(X), size=subsample_size)])

`global_importances(X, p=0.1)`

Compute the global importances of the features for the dataset.

Parameters:

Name	Type	Description	Default
`X`	`array`	Input dataset	required
`p`	`float`	Proportion of outliers (i.e. threshold for the anomaly score). Defaults to 0.1	`0.1`

Returns:

Type	Description
`array`	Global importances of the features for the dataset.

Source code in model_reboot/EIF_reboot.py

def global_importances(self,
                       X:np.array,
                       p:float=0.1) -> np.array:

    """
    Compute the global importances of the features for the dataset.

    Args:
        X: Input dataset
        p: Proportion of outliers (i.e. threshold for the anomaly score). Defaults to 0.1

    Returns:
        Global importances of the features for the dataset.
    """

    self.compute_ids(X)
    y_hat = self._predict(X,p)
    importances, normals = self._importances(X, self.ids)
    outliers_importances,outliers_normals = np.sum(importances[y_hat],axis=0),np.sum(normals[y_hat],axis=0) 
    inliers_importances,inliers_normals = np.sum(importances[~y_hat],axis=0),np.sum(normals[~y_hat],axis=0)
    return (outliers_importances/outliers_normals)/(inliers_importances/inliers_normals)

`local_importances(X)`

Compute the local importances of the features for the dataset.

Parameters:

Name	Type	Description	Default
`X`	`array`	Input dataset	required

Returns:

Type	Description
`array`	Local importances of the features for the dataset.

Source code in model_reboot/EIF_reboot.py

def local_importances(self,
                      X:np.array) -> np.array:

    """
    Compute the local importances of the features for the dataset.

    Args:
        X: Input dataset

    Returns:
       Local importances of the features for the dataset.
    """

    self.compute_ids(X)
    importances, normals = self._importances(X, self.ids)
    return importances/normals

`predict(X)`

Predict the anomaly score for each data point in the dataset.

Parameters:

Name	Type	Description	Default
`X`	`array`	Input dataset	required

Returns:

Type	Description
`array`	Anomaly score for each data point in the dataset.

Source code in model_reboot/EIF_reboot.py

def predict(self, X:np.array) -> np.array:

    """
    Predict the anomaly score for each data point in the dataset.

    Args:
        X: Input dataset

    Returns:
        Anomaly score for each data point in the dataset.
    """
    self.compute_ids(X)
    #predictions=[tree.predict(X,self.ids[i]) for i,tree in enumerate(self.trees)]
    predictions=[tree.predict(self.ids[i]) for i,tree in enumerate(self.trees)]
    values = np.array([p[0] for p in predictions])
    return np.power(2,-np.mean([value for value in values], axis=0))

`ExtendedTree`

Class that represents an Isolation Tree in the Extended Isolation Forest model.

Attributes:

Name	Type	Description
`plus`	`bool`	Boolean flag to indicate if the model is a `EIF` or `EIF+`. Defaults to True (i.e. `EIF+`)
`locked_dims`	`int`	Number of dimensions to be locked in the model. Defaults to 0
`max_depth`	`int`	Maximum depth of the tree
`min_sample`	`int`	Minimum number of samples in a node. Defaults to 1
`n`	`int`	Number of samples in the dataset
`d`	`int`	Number of dimensions in the dataset
`node_count`	`int`	Counter for the number of nodes in the tree
`max_nodes`	`int`	Maximum number of nodes in the tree. Defaults to 10000
`path_to`	`array`	Array to store the path to the leaf nodes
`path_to_Right_Left`	`array`	Array to store the path to the leaf nodes with directions
`child_left`	`array`	Array to store the left child nodes
`child_right`	`array`	Array to store the right child nodes
`normals`	`array`	Array to store the normal vectors of the splitting hyperplanes
`intercepts`	`array`	Array to store the intercept values of the splitting hyperplanes
`node_size`	`array`	Array to store the size of the nodes
`depth`	`array`	Array to store the depth of the nodes
`corrected_depth`	`array`	Array to store the corrected depth of the nodes
`importances_right`	`array`	Array to store the importances of the right child nodes
`importances_left`	`array`	Array to store the importances of the left child nodes
`eta`	`float`	Eta value for the model. Defaults to 1.5

Source code in model_reboot/EIF_reboot.py

@jitclass(tree_spec)
class ExtendedTree:

    """
    Class that represents an Isolation Tree in the Extended Isolation Forest model.


    Attributes:
        plus (bool): Boolean flag to indicate if the model is a `EIF` or `EIF+`. Defaults to True (i.e. `EIF+`)
        locked_dims (int): Number of dimensions to be locked in the model. Defaults to 0
        max_depth (int): Maximum depth of the tree
        min_sample (int): Minimum number of samples in a node. Defaults to 1
        n (int): Number of samples in the dataset
        d (int): Number of dimensions in the dataset
        node_count (int): Counter for the number of nodes in the tree
        max_nodes (int): Maximum number of nodes in the tree. Defaults to 10000
        path_to (np.array): Array to store the path to the leaf nodes
        path_to_Right_Left (np.array): Array to store the path to the leaf nodes with directions
        child_left (np.array): Array to store the left child nodes
        child_right (np.array): Array to store the right child nodes
        normals (np.array): Array to store the normal vectors of the splitting hyperplanes
        intercepts (np.array): Array to store the intercept values of the splitting hyperplanes
        node_size (np.array): Array to store the size of the nodes
        depth (np.array): Array to store the depth of the nodes
        corrected_depth (np.array): Array to store the corrected depth of the nodes
        importances_right (np.array): Array to store the importances of the right child nodes
        importances_left (np.array): Array to store the importances of the left child nodes
        eta (float): Eta value for the model. Defaults to 1.5

    """

    def __init__(self,
                 n:int,
                 d:int,
                 max_depth:int,
                 locked_dims:int=0,
                 min_sample:int=1,
                 plus:bool=True,
                 max_nodes:int=10000,
                 eta:float=1.5):

        self.plus = plus
        self.locked_dims = locked_dims 
        self.max_depth = max_depth
        self.min_sample = min_sample
        self.n = n
        self.d = d
        self.node_count = 1
        self.max_nodes = max_nodes
        self.eta = eta

        self.path_to = -np.ones((max_nodes, max_depth+1), dtype=np.int64)
        self.path_to_Right_Left = np.zeros((max_nodes, max_depth+1), dtype=np.int64)
        self.child_left = np.zeros(max_nodes, dtype=np.int64)
        self.child_right = np.zeros(max_nodes, dtype=np.int64)
        self.normals = np.zeros((max_nodes, d), dtype=np.float64)
        self.intercepts = np.zeros(max_nodes, dtype=np.float64)
        self.node_size = np.zeros(max_nodes, dtype=np.int64)
        self.depth = np.zeros(max_nodes, dtype=np.int64)
        self.corrected_depth = np.zeros(max_nodes, dtype=np.float64)
        self.importances_right = np.zeros((max_nodes, d), dtype=np.float64)
        self.importances_left = np.zeros((max_nodes, d), dtype=np.float64)

    def fit(self, X:np.array) -> None:

        """
        Fit the model to the dataset.

        Args:
            X: Input dataset

        Returns:
            The method fits the model and does not return any value.
        """

        self.path_to[0,0] = 0
        self.extend_tree(node_id=0, X=X, depth=0)
        self.corrected_depth = np.array([
            (c_factor(k)+sum(path>-1))/c_factor(self.n)
            for i,(k,path) in enumerate(zip(self.node_size,self.path_to))
            if i<self.node_count
        ])

    def create_new_node(self,
                        parent_id:int,
                        direction:int) -> int:

        """
        Create a new node in the tree.

        Args:
            parent_id: Parent node id
            direction: Direction to the new node

        Returns:
            New node id

        """

        new_node_id = self.node_count
        self.node_count+=1
        self.path_to[new_node_id] = self.path_to[parent_id]
        self.path_to_Right_Left[new_node_id] = self.path_to_Right_Left[parent_id]
        self.path_to[new_node_id, self.depth[parent_id]+1] = new_node_id
        self.path_to_Right_Left[new_node_id, self.depth[parent_id]] = direction
        self.depth[new_node_id] = self.depth[parent_id]+1     
        return new_node_id

    def extend_tree(self,
                    node_id:int,
                    X:npt.NDArray,
                    depth: int) -> None:

        """
        Extend the tree to the given node.

        Args:
            node_id: Node id
            X: Input dataset
            depth: Depth of the node

        Returns:
            The method extends the tree and does not return any value.
        """

        stack = [(0, X, 0)] 

        while stack:
            node_id, data, depth = stack.pop()

            self.node_size[node_id] = len(data)
            if self.node_size[node_id] <= self.min_sample or depth >= self.max_depth:
                continue

            self.normals[node_id] = make_rand_vector(self.d - self.locked_dims, self.d)         

            dist = np.dot(np.ascontiguousarray(data), np.ascontiguousarray(self.normals[node_id]))

            if self.plus:
                #self.intercepts[node_id] = np.random.normal(np.mean(dist),np.std(dist)*self.eta)
                self.intercepts[node_id] = np.random.normal(np.mean(dist),np.std(dist)*self.eta)
            else:
                self.intercepts[node_id] = np.random.uniform(np.min(dist),np.max(dist))
            mask = dist <= self.intercepts[node_id]  

            X_left = data[mask]
            X_right = data[~mask,:]

            self.importances_left[node_id] = np.abs(self.normals[node_id])*(self.node_size[node_id]/(len(X_left)+1))
            self.importances_right[node_id] = np.abs(self.normals[node_id])*self.node_size[node_id]/(len(X_right)+1)

            left_child = self.create_new_node(node_id,-1)
            right_child = self.create_new_node(node_id,1)

            self.child_left[node_id] = left_child
            self.child_right[node_id] = right_child

            stack.append((left_child, X_left, depth + 1))
            stack.append((right_child, X_right, depth + 1))

            if self.node_count >= self.max_nodes:
                raise ValueError("Max number of nodes reached")

    def leaf_ids(self, X:np.array) -> np.array:
        """
        Get the leaf node ids for each data point in the dataset.

        This is a stub method of `get_leaf_ids`.

        Args:
            X: Input dataset

        Returns:
           Leaf node ids for each data point in the dataset.
        """
        return get_leaf_ids(X, self.child_left, self.child_right, self.normals, self.intercepts) 


    def apply(self, X:np.array) -> None:
        """
        Update the `path_to` attribute with the path to the leaf nodes for each data point in the dataset.

        Args:
            X: Input dataset

        Returns:
            The method updates `path_to` and does not return any value.
        """
        return self.path_to[self.leaf_ids(X)] 

    def predict(self,
                ids:np.array) -> np.array:

        """
        Predict the anomaly score for each data point in the dataset.

        Args:
            ids: Leaf node ids for each data point in the dataset.

        Returns:
            Anomaly score for each data point in the dataset.
        """
        return self.corrected_depth[ids],

    def importances(self,ids:np.array) -> tuple[np.array,np.array]:

        """
        Compute the importances of the features for the given leaf node ids.

        Args:
            ids: Leaf node ids for each data point in the dataset.

        Returns:
            Importances of the features for the given leaf node ids and the normal vectors.
        """
        importances,normals = calculate_importances(
            self.path_to[ids], 
            self.path_to_Right_Left[ids], 
            self.importances_left, 
            self.importances_right, 
            self.normals, 
            self.d
        )

        return importances, normals

`apply(X)`

Update the path_to attribute with the path to the leaf nodes for each data point in the dataset.

Parameters:

Name	Type	Description	Default
`X`	`array`	Input dataset	required

Returns:

Type	Description
`None`	The method updates `path_to` and does not return any value.

Source code in model_reboot/EIF_reboot.py

def apply(self, X:np.array) -> None:
    """
    Update the `path_to` attribute with the path to the leaf nodes for each data point in the dataset.

    Args:
        X: Input dataset

    Returns:
        The method updates `path_to` and does not return any value.
    """
    return self.path_to[self.leaf_ids(X)] 

`create_new_node(parent_id, direction)`

Create a new node in the tree.

Parameters:

Name	Type	Description	Default
`parent_id`	`int`	Parent node id	required
`direction`	`int`	Direction to the new node	required

Returns:

Type	Description
`int`	New node id

Source code in model_reboot/EIF_reboot.py

def create_new_node(self,
                    parent_id:int,
                    direction:int) -> int:

    """
    Create a new node in the tree.

    Args:
        parent_id: Parent node id
        direction: Direction to the new node

    Returns:
        New node id

    """

    new_node_id = self.node_count
    self.node_count+=1
    self.path_to[new_node_id] = self.path_to[parent_id]
    self.path_to_Right_Left[new_node_id] = self.path_to_Right_Left[parent_id]
    self.path_to[new_node_id, self.depth[parent_id]+1] = new_node_id
    self.path_to_Right_Left[new_node_id, self.depth[parent_id]] = direction
    self.depth[new_node_id] = self.depth[parent_id]+1     
    return new_node_id

`extend_tree(node_id, X, depth)`

Extend the tree to the given node.

Parameters:

Name	Type	Description	Default
`node_id`	`int`	Node id	required
`X`	`NDArray`	Input dataset	required
`depth`	`int`	Depth of the node	required

Returns:

Type	Description
`None`	The method extends the tree and does not return any value.

Source code in model_reboot/EIF_reboot.py

def extend_tree(self,
                node_id:int,
                X:npt.NDArray,
                depth: int) -> None:

    """
    Extend the tree to the given node.

    Args:
        node_id: Node id
        X: Input dataset
        depth: Depth of the node

    Returns:
        The method extends the tree and does not return any value.
    """

    stack = [(0, X, 0)] 

    while stack:
        node_id, data, depth = stack.pop()

        self.node_size[node_id] = len(data)
        if self.node_size[node_id] <= self.min_sample or depth >= self.max_depth:
            continue

        self.normals[node_id] = make_rand_vector(self.d - self.locked_dims, self.d)         

        dist = np.dot(np.ascontiguousarray(data), np.ascontiguousarray(self.normals[node_id]))

        if self.plus:
            #self.intercepts[node_id] = np.random.normal(np.mean(dist),np.std(dist)*self.eta)
            self.intercepts[node_id] = np.random.normal(np.mean(dist),np.std(dist)*self.eta)
        else:
            self.intercepts[node_id] = np.random.uniform(np.min(dist),np.max(dist))
        mask = dist <= self.intercepts[node_id]  

        X_left = data[mask]
        X_right = data[~mask,:]

        self.importances_left[node_id] = np.abs(self.normals[node_id])*(self.node_size[node_id]/(len(X_left)+1))
        self.importances_right[node_id] = np.abs(self.normals[node_id])*self.node_size[node_id]/(len(X_right)+1)

        left_child = self.create_new_node(node_id,-1)
        right_child = self.create_new_node(node_id,1)

        self.child_left[node_id] = left_child
        self.child_right[node_id] = right_child

        stack.append((left_child, X_left, depth + 1))
        stack.append((right_child, X_right, depth + 1))

        if self.node_count >= self.max_nodes:
            raise ValueError("Max number of nodes reached")

`fit(X)`

Fit the model to the dataset.

Parameters:

Name	Type	Description	Default
`X`	`array`	Input dataset	required

Returns:

Type	Description
`None`	The method fits the model and does not return any value.

Source code in model_reboot/EIF_reboot.py

def fit(self, X:np.array) -> None:

    """
    Fit the model to the dataset.

    Args:
        X: Input dataset

    Returns:
        The method fits the model and does not return any value.
    """

    self.path_to[0,0] = 0
    self.extend_tree(node_id=0, X=X, depth=0)
    self.corrected_depth = np.array([
        (c_factor(k)+sum(path>-1))/c_factor(self.n)
        for i,(k,path) in enumerate(zip(self.node_size,self.path_to))
        if i<self.node_count
    ])

`importances(ids)`

Compute the importances of the features for the given leaf node ids.

Parameters:

Name	Type	Description	Default
`ids`	`array`	Leaf node ids for each data point in the dataset.	required

Returns:

Type	Description
`tuple[array, array]`	Importances of the features for the given leaf node ids and the normal vectors.

Source code in model_reboot/EIF_reboot.py

def importances(self,ids:np.array) -> tuple[np.array,np.array]:

    """
    Compute the importances of the features for the given leaf node ids.

    Args:
        ids: Leaf node ids for each data point in the dataset.

    Returns:
        Importances of the features for the given leaf node ids and the normal vectors.
    """
    importances,normals = calculate_importances(
        self.path_to[ids], 
        self.path_to_Right_Left[ids], 
        self.importances_left, 
        self.importances_right, 
        self.normals, 
        self.d
    )

    return importances, normals

`leaf_ids(X)`

Get the leaf node ids for each data point in the dataset.

This is a stub method of get_leaf_ids.

Parameters:

Name	Type	Description	Default
`X`	`array`	Input dataset	required

Returns:

Type	Description
`array`	Leaf node ids for each data point in the dataset.

Source code in model_reboot/EIF_reboot.py

def leaf_ids(self, X:np.array) -> np.array:
    """
    Get the leaf node ids for each data point in the dataset.

    This is a stub method of `get_leaf_ids`.

    Args:
        X: Input dataset

    Returns:
       Leaf node ids for each data point in the dataset.
    """
    return get_leaf_ids(X, self.child_left, self.child_right, self.normals, self.intercepts) 

`predict(ids)`

Predict the anomaly score for each data point in the dataset.

Parameters:

Name	Type	Description	Default
`ids`	`array`	Leaf node ids for each data point in the dataset.	required

Returns:

Type	Description
`array`	Anomaly score for each data point in the dataset.

Source code in model_reboot/EIF_reboot.py

def predict(self,
            ids:np.array) -> np.array:

    """
    Predict the anomaly score for each data point in the dataset.

    Args:
        ids: Leaf node ids for each data point in the dataset.

    Returns:
        Anomaly score for each data point in the dataset.
    """
    return self.corrected_depth[ids],

`IsolationForest`

Bases: ExtendedIsolationForest

Class that represents the Isolation Forest model.

This is a subclass of ExtendedIsolationForest with the plus attribute set to False and the locked_dims attribute set to the number of dimensions minus one.

Attributes:

Name	Type	Description
`n_estimators`	`int`	Number of trees in the model. Defaults to 400
`max_depth`	`Union[str, int]`	Maximum depth of the trees. Defaults to "auto"
`max_samples`	`Union[str, int]`	Maximum number of samples in a node. Defaults to "auto"

Source code in model_reboot/EIF_reboot.py

class IsolationForest(ExtendedIsolationForest):

    """
    Class that represents the Isolation Forest model. 

    This is a subclass of `ExtendedIsolationForest` with the `plus` attribute set to False and the 
    `locked_dims` attribute set to the number of dimensions minus one.

    Attributes:
        n_estimators (int): Number of trees in the model. Defaults to 400
        max_depth (Union[str,int]): Maximum depth of the trees. Defaults to "auto"
        max_samples (Union[str,int]): Maximum number of samples in a node. Defaults to "auto"

    """
    def __init__(self,
                 n_estimators:int=400,
                 max_depth:Union[str,int]="auto",
                 max_samples:Union[str,int]="auto") -> None:
        super().__init__(plus=False,n_estimators=n_estimators,max_depth=max_depth,max_samples=max_samples)
        self.name="IF"

    def fit(self,
            X:np.array) -> None:

        """
        Fit the model to the dataset.

        Args:
            X: Input dataset

        Returns:
            The method fits the model and does not return any value.
        """

        return super().fit(X, locked_dims=X.shape[1]-1)

    def decision_function_single_tree(self,
                                      tree_idx:int,
                                      X:np.array,
                                      p:float=0.1) -> tuple[np.array,np.array]:

        """
        Predict the anomaly score for each data point in the dataset using a single tree.

        Args:
            tree_idx: Index of the tree
            X: Input dataset
            p: Proportion of outliers (i.e. threshold for the anomaly score). Defaults to 0.1

        Returns:
            Anomaly score for each data point in the dataset and the predicted class for each data point in the dataset.
        """

        self.compute_ids(X)
        pred=self.trees[tree_idx].predict(X,self.ids[tree_idx])[0]
        score=np.power(2,-pred)
        y_hat = np.array(score > sorted(score,reverse=True)[int(p*len(score))],dtype=int)
        return score,y_hat

`decision_function_single_tree(tree_idx, X, p=0.1)`

Predict the anomaly score for each data point in the dataset using a single tree.

Parameters:

Name	Type	Description	Default
`tree_idx`	`int`	Index of the tree	required
`X`	`array`	Input dataset	required
`p`	`float`	Proportion of outliers (i.e. threshold for the anomaly score). Defaults to 0.1	`0.1`

Returns:

Type	Description
`tuple[array, array]`	Anomaly score for each data point in the dataset and the predicted class for each data point in the dataset.

Source code in model_reboot/EIF_reboot.py

def decision_function_single_tree(self,
                                  tree_idx:int,
                                  X:np.array,
                                  p:float=0.1) -> tuple[np.array,np.array]:

    """
    Predict the anomaly score for each data point in the dataset using a single tree.

    Args:
        tree_idx: Index of the tree
        X: Input dataset
        p: Proportion of outliers (i.e. threshold for the anomaly score). Defaults to 0.1

    Returns:
        Anomaly score for each data point in the dataset and the predicted class for each data point in the dataset.
    """

    self.compute_ids(X)
    pred=self.trees[tree_idx].predict(X,self.ids[tree_idx])[0]
    score=np.power(2,-pred)
    y_hat = np.array(score > sorted(score,reverse=True)[int(p*len(score))],dtype=int)
    return score,y_hat

`fit(X)`

Fit the model to the dataset.

Parameters:

Name	Type	Description	Default
`X`	`array`	Input dataset	required

Returns:

Type	Description
`None`	The method fits the model and does not return any value.

Source code in model_reboot/EIF_reboot.py

def fit(self,
        X:np.array) -> None:

    """
    Fit the model to the dataset.

    Args:
        X: Input dataset

    Returns:
        The method fits the model and does not return any value.
    """

    return super().fit(X, locked_dims=X.shape[1]-1)

`c_factor(n)`

Average path length of unsuccesful search in a binary search tree given n points. This is a constant factor that will be used as a normalization factor in the Anomaly Score computation.

Parameters:

Name	Type	Description	Default
`n`	`int`	Number of data points for the BST.	required

Returns:

Type	Description
`float`	Average path length of unsuccesful search in a BST

Source code in model_reboot/EIF_reboot.py

@njit(cache=True)
def c_factor(n: int) -> float:
    """
    Average path length of unsuccesful search in a binary search tree given n points.
    This is a constant factor that will be used as a normalization factor in the Anomaly Score computation.

    Args:
        n: Number of data points for the BST.

    Returns:
        Average path length of unsuccesful search in a BST

    """
    if n <=1: return 0
    return 2.0*(np.log(n-1)+0.5772156649) - (2.0*(n-1.)/(n*1.0))

`calculate_importances(paths, directions, importances_left, importances_right, normals, d)`

Calculate the importances of the features for the given paths and directions.

Parameters:

Name	Type	Description	Default
`paths`	`ndarray`	Paths to the leaf nodes	required
`directions`	`ndarray`	Directions to the leaf nodes	required
`importances_left`	`ndarray`	Importances of the left child nodes	required
`importances_right`	`ndarray`	Importances of the right child nodes	required
`normals`	`ndarray`	Normal vectors of the splitting hyperplanes	required
`d`	`int`	Number of dimensions in the dataset	required

Returns:

Type	Description
`tuple[array, array]`	Importances of the features for the given paths and directions and the normal vectors.

Source code in model_reboot/EIF_reboot.py

@njit(cache=True)
def calculate_importances(paths:np.ndarray,
                          directions:np.ndarray, 
                          importances_left:np.ndarray, 
                          importances_right:np.ndarray, 
                          normals:np.ndarray,
                          d:int) -> tuple[np.array,np.array]:

    """
    Calculate the importances of the features for the given paths and directions.

    Args:
        paths: Paths to the leaf nodes
        directions: Directions to the leaf nodes
        importances_left: Importances of the left child nodes
        importances_right: Importances of the right child nodes
        normals: Normal vectors of the splitting hyperplanes
        d: Number of dimensions in the dataset

    Returns:
        Importances of the features for the given paths and directions and the normal vectors.
    """

    # Flatten the paths and directions for 1D boolean indexing
    paths_flat = paths.flatten()
    directions_flat = directions.flatten()

    # Create masks for left and right directions
    left_mask_flat = directions_flat == -1
    right_mask_flat = directions_flat == 1

    # Use masks to filter flattened paths; initialize with -1 (or suitable default)
    left_paths_flat = np.full_like(paths_flat, -1)
    right_paths_flat = np.full_like(paths_flat, -1)

    # Apply the masks
    left_paths_flat[left_mask_flat] = paths_flat[left_mask_flat]
    right_paths_flat[right_mask_flat] = paths_flat[right_mask_flat]

    # Since importances are mentioned to be arrays of arrays, let's assume we can index them directly with the flattened paths
    # Note: This step might need adjustment based on the actual structure and intended calculations
    importances_sum_left = np.zeros((paths.shape[0],d), dtype=np.float64)  # Initialize to match number of rows in paths
    importances_sum_right = np.zeros((paths.shape[0],d), dtype=np.float64)

    normals_sum = np.zeros((paths.shape[0],d), dtype=np.float64)  # Initialize to match number of rows in paths

    importances_sum_left = importances_left[left_paths_flat].reshape(paths.shape[0],paths.shape[1],d).sum(axis=1)
    importances_sum_right = importances_right[right_paths_flat].reshape(paths.shape[0],paths.shape[1],d).sum(axis=1)
    normals_sum = np.abs(normals[paths_flat]).reshape(paths.shape[0],paths.shape[1],d).sum(axis=1)

    importances_sum = importances_sum_left + importances_sum_right

    return importances_sum, normals_sum

`get_leaf_ids(X, child_left, child_right, normals, intercepts)`

Get the leaf node ids for each data point in the dataset.

Parameters:

Name	Type	Description	Default
`X`	`array`	Data points	required
`child_left`	`array`	Left child node ids	required
`child_right`	`array`	Right child node ids	required
`normals`	`array`	Normal vectors of the splitting hyperplanes	required
`intercepts`	`array`	Intercept values of the splitting hyperplanes	required

Returns:

Type	Description
`array`	Leaf node ids for each data point in the dataset.

Source code in model_reboot/EIF_reboot.py

@njit(cache=True)
def get_leaf_ids(X:np.array,
                 child_left:np.array,
                 child_right:np.array,
                 normals:np.array,
                 intercepts:np.array) -> np.array:

        """
        Get the leaf node ids for each data point in the dataset.

        Args:
            X: Data points
            child_left: Left child node ids
            child_right: Right child node ids
            normals: Normal vectors of the splitting hyperplanes
            intercepts: Intercept values of the splitting hyperplanes

        Returns:
           Leaf node ids for each data point in the dataset.
        """
        res = []
        for x in X:
            node_id = 0
            while child_left[node_id] or child_right[node_id]:
                d = np.dot(np.ascontiguousarray(x),np.ascontiguousarray(normals[node_id]))
                node_id = child_left[node_id] if d <= intercepts[node_id] else child_right[node_id]        
            res.append(int(node_id))
        return np.array(res)

`make_rand_vector(df, dimensions)`

Generate a random unitary vector in the unit ball with a maximum number of dimensions. This vector will be successively used in the generation of the splitting hyperplanes.

Parameters:

Name	Type	Description	Default
`df`	`int`	Degrees of freedom	required
`dimensions`	`int`	number of dimensions of the feature space	required

Raises:

Type	Description
`ValueError`	If the degree of freedom does not match with the dataset dimensions

Returns:

Name	Type	Description
`vec`	`NDArray[float64]`	Random unitary vector in the unit ball

Source code in model_reboot/EIF_reboot.py

@njit(cache=True)
def make_rand_vector(df:int,
                     dimensions:int) -> npt.NDArray[np.float64]:
    """
    Generate a random unitary vector in the unit ball with a maximum number of dimensions. 
    This vector will be successively used in the generation of the splitting hyperplanes.

    Args:
        df: Degrees of freedom
        dimensions: number of dimensions of the feature space

    Raises:
        ValueError: If the degree of freedom does not match with the dataset dimensions

    Returns:
        vec: Random unitary vector in the unit ball

    """
    if dimensions<df:
        raise ValueError("degree of freedom does not match with dataset dimensions")
    else:
        vec_ = np.random.normal(loc=0.0, scale=1.0, size=df)
        indexes = np.random.choice(np.arange(dimensions),df,replace=False)
        vec = np.zeros(dimensions)
        vec[indexes] = vec_
        vec=vec/np.linalg.norm(vec)
    return vec

Models

ExtendedIsolationForest

avg_number_of_nodes: float property

compute_ids(X)

fit(X, locked_dims=None)

global_importances(X, p=0.1)

local_importances(X)

predict(X)

ExtendedTree

apply(X)

create_new_node(parent_id, direction)

extend_tree(node_id, X, depth)

fit(X)

importances(ids)

leaf_ids(X)

predict(ids)

IsolationForest

decision_function_single_tree(tree_idx, X, p=0.1)

fit(X)

c_factor(n)

calculate_importances(paths, directions, importances_left, importances_right, normals, d)

get_leaf_ids(X, child_left, child_right, normals, intercepts)

make_rand_vector(df, dimensions)

`ExtendedIsolationForest`

`avg_number_of_nodes: float` `property`

`compute_ids(X)`

`fit(X, locked_dims=None)`

`global_importances(X, p=0.1)`

`local_importances(X)`

`predict(X)`

`ExtendedTree`

`apply(X)`

`create_new_node(parent_id, direction)`

`extend_tree(node_id, X, depth)`

`fit(X)`

`importances(ids)`

`leaf_ids(X)`

`predict(ids)`

`IsolationForest`

`decision_function_single_tree(tree_idx, X, p=0.1)`

`fit(X)`

`c_factor(n)`

`calculate_importances(paths, directions, importances_left, importances_right, normals, d)`

`get_leaf_ids(X, child_left, child_right, normals, intercepts)`

`make_rand_vector(df, dimensions)`