Source code for id3.id3

import numpy as np
import numbers
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split

from .tree import TreeBuilder, Tree
from .splitter import Splitter
from .utils import check_numerical_array, ExtendedLabelEncoder


class Id3Estimator(BaseEstimator):
    """A decision tree estimator for deriving ID3 decision trees.

    Parameters
    ----------
    max_depth : int, optional
        Maximum depth of the tree.
    min_samples_split : int, optional (default=2)
        Minimum number of samples required to split a node.
    prune : bool, optional (default=False)
        Set to True to prune the tree after fitting.
    gain_ratio : bool, optional (default=False)
        Use gain ratio instead of information gain for split calculations.
    min_entropy_decrease : float, optional (default=0.0)
        Minimum entropy decrease required to split a node.
    is_repeating : bool, optional (default=False)
        Allow features to be reused in splits further down the tree.

    Attributes
    ----------
    max_depth : int
    min_samples_split : int
    prune : bool
    gain_ratio : bool
    min_entropy_decrease : float
    is_repeating : bool
    """

    def __init__(self,
                 max_depth=None,
                 min_samples_split=2,
                 prune=False,
                 gain_ratio=False,
                 min_entropy_decrease=0.0,
                 is_repeating=False):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.prune = prune
        self.gain_ratio = gain_ratio
        self.min_entropy_decrease = min_entropy_decrease
        self.is_repeating = is_repeating
    def fit(self, X, y, check_input=True):
        """Build a decision tree based on samples X and corresponding
        classifications y.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.
        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (class labels in classification, real numbers
            in regression).
        check_input : bool (default=True)
            Whether to check the input for numerical features.

        Attributes
        ----------
        n_features_ : int
            The number of features when ``fit`` is performed.
        X_encoders_ : list
            List of LabelEncoders that transform input from labels to
            integer encodings and vice versa.
        y_encoder_ : LabelEncoder
            LabelEncoder that transforms output from labels to integer
            encodings and vice versa.
        is_numerical_ : bool array of size [n_features]
            Array flagging which features are assumed to be numerical.
        builder_ : TreeBuilder
            Instance of the tree builder.
        tree_ : Tree
            Instance of the built tree.

        Returns
        -------
        self : object
            Returns self.
        """
        X_, y_ = check_X_y(X, y)
        self.y_encoder_ = ExtendedLabelEncoder()
        y_ = self.y_encoder_.fit_transform(y_)

        max_np_int = np.iinfo(np.int32).max

        # Fall back to an effectively unbounded depth when max_depth is
        # not an integer (e.g. the default None).
        if not isinstance(self.max_depth, (numbers.Integral, np.integer)):
            max_depth = max_np_int
        else:
            max_depth = self.max_depth

        if isinstance(self.min_samples_split, (numbers.Integral, np.integer)):
            min_samples_split = (1 if self.min_samples_split < 1
                                 else self.min_samples_split)
        else:
            min_samples_split = 1

        if isinstance(self.min_entropy_decrease,
                      (numbers.Real, np.floating, np.integer)):
            min_entropy_decrease = (0 if self.min_entropy_decrease < 0
                                    else self.min_entropy_decrease)
        else:
            min_entropy_decrease = 0

        _, self.n_features_ = X_.shape
        self.is_numerical_ = [False] * self.n_features_
        X_tmp = np.zeros(X_.shape, dtype=np.float32)
        self.X_encoders_ = [ExtendedLabelEncoder()
                            for _ in range(self.n_features_)]
        for i in range(self.n_features_):
            if check_input and check_numerical_array(X_[:, i]):
                self.is_numerical_[i] = True
                X_tmp[:, i] = X_[:, i]
            else:
                X_tmp[:, i] = self.X_encoders_[i].fit_transform(X_[:, i])
        X_ = X_tmp

        # Hold out 30% of the training data as a validation set for pruning.
        if self.prune:
            X_, X_test, y_, y_test = train_test_split(X_, y_, test_size=0.3)

        splitter = Splitter(X_,
                            y_,
                            self.is_numerical_,
                            self.X_encoders_,
                            self.gain_ratio)
        self.builder_ = TreeBuilder(splitter,
                                    self.y_encoder_,
                                    X_.shape[0],
                                    self.n_features_,
                                    self.is_numerical_,
                                    max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_entropy_decrease=min_entropy_decrease,
                                    prune=self.prune,
                                    is_repeating=self.is_repeating)
        self.tree_ = Tree(X_encoders=self.X_encoders_,
                          y_encoder=self.y_encoder_)
        if self.prune:
            self.builder_.build(self.tree_, X_, y_, X_test, y_test)
        else:
            self.builder_.build(self.tree_, X_, y_)
        return self
    def predict(self, X):
        """Predict class for every sample in X.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        y : array of shape = [n_samples]
        """
        check_is_fitted(self, 'tree_')
        X = check_array(X)
        n_features = X.shape[1]
        if n_features != self.n_features_:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {} and "
                             "input n_features is {}."
                             .format(self.n_features_, n_features))
        X_ = np.empty(X.shape)
        for i in range(self.n_features_):
            if self.is_numerical_[i]:
                X_[:, i] = X[:, i]
            else:
                try:
                    X_[:, i] = self.X_encoders_[i].transform(X[:, i])
                except ValueError as e:
                    raise ValueError('New attribute value not found in '
                                     'train data.') from e
        y = self.builder_._predict(self.tree_, X_)
        return self.y_encoder_.inverse_transform(y)
    def predict_proba(self, X):
        """Predict class probabilities for every sample in X.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        y : array of shape = [n_samples, n_classes]
        """
        check_is_fitted(self, 'tree_')
        X = check_array(X)
        n_features = X.shape[1]
        if n_features != self.n_features_:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {} and "
                             "input n_features is {}."
                             .format(self.n_features_, n_features))
        X_ = np.empty(X.shape)
        for i in range(self.n_features_):
            if self.is_numerical_[i]:
                X_[:, i] = X[:, i]
            else:
                try:
                    X_[:, i] = self.X_encoders_[i].transform(X[:, i])
                except ValueError as e:
                    raise ValueError('New attribute value not found in '
                                     'train data.') from e
        y = self.builder_._predict_proba(self.tree_, X_)
        return y
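
A minimal usage sketch follows, assuming the package's top-level import
(``from id3 import Id3Estimator``); the data values are invented for
illustration. Numerical feature columns are used here because they pass
``check_X_y`` unchanged; string-valued feature columns are handled by the
per-column ``ExtendedLabelEncoder``s, as ``fit`` above shows.

    import numpy as np
    from id3 import Id3Estimator

    # Toy data, invented for illustration: two numerical feature columns
    # (e.g. temperature and humidity) and string class labels.
    X = np.array([[85, 85],
                  [80, 90],
                  [70, 96],
                  [68, 80],
                  [65, 70],
                  [64, 65]], dtype=np.float64)
    y = np.array(["no", "no", "yes", "yes", "yes", "no"])

    estimator = Id3Estimator(max_depth=3, gain_ratio=True)
    estimator.fit(X, y)

    # predict() returns labels in their original encoding, mapped back
    # through y_encoder_.
    print(estimator.predict(np.array([[72, 75]])))

    # predict_proba() returns one row per sample with one probability per
    # class, shape [n_samples, n_classes].
    print(estimator.predict_proba(np.array([[72, 75]])))

Note that ``prune=True`` makes ``fit`` hold out 30% of the training data via
``train_test_split`` as a validation set for pruning, so it needs enough
samples to populate both partitions.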