Decision Tree


DSCI 571 - Supervised Learning I

Use case:


What is it?

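  • For classification: the model is a tree of if/else questions on feature values; each leaf predicts the majority class of the training examples that reach it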
    • At each node, the algorithm learns/decides
      • which feature is the most useful for classification at that point
      • what threshold to use
      • objective: to reduce impurity at each node
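  • For regression: the tree is built the same way, but each leaf predicts a number (the mean target of its training examples when the criterion is squared_error)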


define display_tree
# Reference: DSCI_571_sup-learn-1/lectures/code/
import re 
import graphviz

from sklearn.tree import export_graphviz

def display_tree(feature_names, tree, counts=False):
    """Render a fitted decision tree as a graphviz diagram.

    For binary classification only: the label-trimming patterns below expect
    each node's ``value = [a, b]`` line to hold exactly two integers.

    Parameters:
    - feature_names (sequence of str or None): Names used to label the split
      feature in each node.
    - tree: The fitted decision tree classifier to draw.
    - counts (bool): If True, keep the per-class numbers in each node
      (relabelled from "value" to "counts") and remove only the "samples"
      line. If False, remove both the "samples" and "value" lines.

    Returns:
    - graphviz.Source: the trimmed DOT description, ready to display.
    """
    # NOTE(review): the argument list of this call was missing from the file
    # and has been reconstructed. out_file=None makes export_graphviz return
    # the DOT text as a string; class_names is assumed to come from the
    # fitted estimator's classes_ attribute -- confirm against the reference
    # implementation.
    names = None if feature_names is None else list(feature_names)
    dot = export_graphviz(
        tree,
        out_file=None,
        feature_names=names,
        class_names=[str(label) for label in tree.classes_],
    )

    # Node labels in DOT text separate their lines with a literal backslash
    # followed by "n", which is what the r"\\n" fragments match.
    # To also drop the "class = ..." line, append
    # r"(\\nclass = [A-Za-z0-9]+)" to the first pattern of the else branch.
    if counts:
        dot = re.sub(r"(samples = [0-9]+)\\n", "", dot)
        dot = dot.replace("value", "counts")
    else:
        # First pattern: "samples"/"value" lines that follow another line.
        dot = re.sub(
            r"(\\nsamples = [0-9]+)(\\nvalue = \[[0-9]+, [0-9]+\])", "", dot
        )
        # Second pattern: the same pair when it starts the label and is
        # followed by another line.
        dot = re.sub(
            r"(samples = [0-9]+)(\\nvalue = \[[0-9]+, [0-9]+\])\\n", "", dot
        )

    return graphviz.Source(dot)
define plot_tree_decision_boundary_and_tree
# Reference: DSCI_571_sup-learn-1_students/lectures/code/
import matplotlib.pyplot as plt
import mglearn
from sklearn.tree import plot_tree

# Custom function to customize the tree plot and hide values and samples
def custom_plot_tree(tree_model, feature_names=None, class_names=None, **kwargs):
    """
    Draw a fitted scikit-learn decision tree and hide the per-node sample counts.

    Parameters:
    - tree_model (sklearn.tree.DecisionTreeClassifier): The trained Decision Tree Classifier to visualize.
    - feature_names (list or None): A list of feature names to label the tree nodes with feature names.
                                    If None, generic feature names will be used.
    - class_names (list or None): A list of class names to label the tree nodes with class names.
                                  If None, generic class names will be used.
    - **kwargs: Additional keyword arguments to be passed to the `sklearn.tree.plot_tree` function
                (for example `ax`, `fontsize`, `impurity`, `filled`).

    Returns:
    - None: The function displays the customized tree plot using Matplotlib.

    The tree is drawn with `plot_tree`, then the "samples = N" line is removed
    from the text of every node. Other node lines, including "value", are
    left as `plot_tree` produced them.
    """
    # NOTE(review): this plot_tree call and the set_text/show calls below were
    # missing from the file and have been reconstructed from the docstring.
    plot_tree(
        tree_model,
        feature_names=feature_names,
        class_names=class_names,
        **kwargs,
    )

    # Edit the axes that were actually drawn on: the caller's `ax` when one
    # was passed through kwargs, otherwise the current axes.
    target_ax = kwargs.get("ax")
    if target_ax is None:
        target_ax = plt.gca()

    # Customize the appearance of the text elements for each node
    for text in target_ax.texts:
        new_text = re.sub(r"samples = \d+\n", "", text.get_text())  # Hide samples
        text.set_text(new_text)

    plt.show()

def plot_tree_decision_boundary(
    model, X, y, x_label="x-axis", y_label="y-axis", eps=None, ax=None, title=None
):
    """Plot a fitted tree's decision regions together with the data points.

    Parameters:
    - model: Fitted decision tree; its `tree_.max_depth` is used for the
      default title.
    - X (pandas.DataFrame): Feature table. Its first two columns give the
      x and y coordinates of the scatter plot.
    - y: Labels used to mark the scattered points.
    - x_label, y_label (str): Axis labels.
    - eps: Forwarded to the decision-boundary plotting helper.
    - ax (matplotlib axes or None): Axes to draw on; the current axes are
      used when None.
    - title (str or None): Plot title; defaults to "max_depth=<depth>".

    Returns:
    - None: everything is drawn onto `ax`.
    """
    if ax is None:
        ax = plt.gca()

    if title is None:
        title = f"max_depth={model.tree_.max_depth:d}"

    # NOTE(review): the callee name was missing from the file; the surviving
    # arguments match mglearn.plots.plot_2d_separator -- confirm.
    mglearn.plots.plot_2d_separator(
        model, X.to_numpy(), eps=eps, fill=True, alpha=0.5, ax=ax
    )
    mglearn.discrete_scatter(X.iloc[:, 0], X.iloc[:, 1], y, ax=ax)

    # NOTE(review): reconstructed -- the labels and title were accepted or
    # computed but never applied in the surviving text.
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)

def plot_tree_decision_boundary_and_tree(
    model, X, y, height=6, width=16, fontsize=9, x_label="x-axis", y_label="y-axis", eps=None
):
    """Show a tree's decision boundary and the tree diagram side by side.

    Parameters:
    - model: Fitted decision tree classifier.
    - X (pandas.DataFrame): Feature table; its column names label the tree
      nodes.
    - y: Target labels.
    - height, width: Figure size in inches.
    - fontsize: Font size of the tree node text.
    - x_label, y_label (str): Axis labels for the decision-boundary panel.
    - eps: Forwarded to `plot_tree_decision_boundary`.

    Returns:
    - None: the figure is drawn with Matplotlib.
    """
    # NOTE(review): the 1x2 grid shape was missing from the file; it is
    # implied by ax[0]/ax[1] and the two width ratios.
    fig, ax = plt.subplots(
        1,
        2,
        figsize=(width, height),
        subplot_kw={"xticks": (), "yticks": ()},
        gridspec_kw={"width_ratios": [1.5, 2]},
    )
    plot_tree_decision_boundary(model, X, y, x_label, y_label, eps, ax=ax[0])
    # NOTE(review): the head of this call was missing from the file; the
    # callee and the feature_names argument are reconstructed -- confirm.
    # The class names are hard-coded for the quiz2 toy dataset.
    custom_plot_tree(
        model,
        feature_names=X.columns.tolist(),
        class_names=['A+', 'not A+'],
        fontsize=fontsize,
        ax=ax[1],
    )


read df
import pandas as pd

# Load the toy classification dataset; the path is relative to the working directory.
df = pd.read_csv("data/quiz2-grade-toy-classification.csv")
ml_experience class_attendance lab1 lab2 lab3 lab4 quiz1 quiz2
0 1 1 92 93 84 91 92 A+
1 1 0 94 90 80 83 91 not A+
2 0 0 78 85 83 80 80 not A+
3 0 1 91 94 92 91 89 A+
4 0 1 77 83 90 92 85 A+
from sklearn.tree import DecisionTreeClassifier

# pop() removes the "quiz2" column from df in place and returns it as the
# target; the remaining columns of df (also bound to X) are the features.
y, X = df.pop("quiz2"), df

# NOTE(review): the original line read `clf = DecisionTreeClassifier(), y)`,
# which is not valid Python; restored as construct / fit / predict.
clf = DecisionTreeClassifier()
clf.fit(X, y)
clf.predict(X)  # predictions for the training examples
array(['A+', 'not A+', 'not A+', 'A+', 'A+', 'not A+', 'A+', 'not A+',
       'not A+', 'not A+', 'A+', 'A+', 'A+', 'A+', 'not A+', 'not A+',
       'A+', 'not A+', 'not A+', 'not A+', 'A+'], dtype=object)
# Score the classifier on X, y (presumably the same data it was fitted on -- verify).
clf.score(X, y) # accuracy
# Draw the fitted tree, labelling split features with the column names of X.
display_tree(X.columns, clf)

Decision stump

from sklearn.tree import DecisionTreeClassifier

# Keep only two features so the decision boundary can be drawn in 2D.
X = df[['lab4', 'quiz1']]

# NOTE(review): the original line read
# `clf = DecisionTreeClassifier(max_depth=1), y)`, which is not valid Python;
# restored as construct / fit. max_depth=1 limits the tree to a single split.
clf = DecisionTreeClassifier(max_depth=1)
clf.fit(X, y)

plot_tree_decision_boundary_and_tree(clf, X, y, x_label='lab4', y_label='quiz1')
read df
import pandas as pd

# Prepare data: load the toy regression dataset (path is relative to the working directory).
df = pd.read_csv("data/quiz2-grade-toy-regression.csv")
ml_experience class_attendance lab1 lab2 lab3 lab4 quiz1 quiz2
0 1 1 92 93 84 91 92 90
1 1 0 94 90 80 83 91 84
2 0 0 78 85 83 80 80 82
3 0 1 91 94 92 91 89 92
4 0 1 77 83 90 92 85 90
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor

# pop() removes the "quiz2" column from df in place and returns it as the
# target; the remaining columns of df (also bound to X) are the features.
y, X = df.pop("quiz2"), df

# NOTE(review): the original line read
# `reg = DummyRegressor(strategy="mean"), y)`, which is not valid Python, and
# DummyRegressor was never imported; restored as construct / fit / predict.
# The "mean" strategy predicts the mean of the training targets for every row.
reg = DummyRegressor(strategy="mean")
reg.fit(X, y)
reg.predict(X)
array([86.28571429, 86.28571429, 86.28571429, 86.28571429, 86.28571429,
       86.28571429, 86.28571429])
reg.score(X, y) # R^2; it can be negative, meaning the model scores worse than DummyRegressor


  • criterion for minimizing impurity
    • (DecisionTreeClassifier) Default: gini
      • gini: Gini impurity
      • entropy: Shannon entropy (information gain)
      • log_loss: equivalent to entropy (Shannon information gain)
    • (DecisionTreeRegressor) Default: squared_error
      • {squared_error, friedman_mse, absolute_error, poisson}
  • max_depth, maximum tree depth. Default: None
    • If None, the tree keeps splitting until every leaf is pure (or has fewer than min_samples_split samples), so it can create very specific rules based on just one example from the data (overfitting)
    • If max_depth = 1, the tree is called a decision stump
  • min_samples_split
  • min_samples_leaf
  • max_leaf_nodes





