In [43]:
%matplotlib inline

Figure 9.15

by W. Trick & M. Walther

In [45]:
"""
Photometric Redshifts by Random Forests
---------------------------------------
Figure 9.15

Photometric redshift estimation using random forest regression, with ten random
trees. Comparison to figure 9.14 shows that random forests correct for the
overfitting evident in very deep decision trees. Here the optimal depth is 20
or above, and a much better cross-validation error is achieved.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from astroML.datasets import fetch_sdss_specgals
try:
    from astroML.utils.decorators import pickle_results
except ImportError:  # older astroML releases keep it in astroML.decorators
    from astroML.decorators import pickle_results

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Fetch and prepare the data
data = fetch_sdss_specgals()

# put magnitudes in a matrix
mag = np.vstack([data['modelMag_%s' % f] for f in 'ugriz']).T
z = data['z']

# train on ~60,000 points
mag_train = mag[::10]
z_train = z[::10]

# test on ~6,000 distinct points
mag_test = mag[1::100]
z_test = z[1::100]
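
# Quick sanity check (sketch): training indices are multiples of 10 and
# test indices leave remainder 1 modulo 100, so the two sets are disjoint
# by construction.
assert not set(range(0, len(mag), 10)) & set(range(1, len(mag), 100))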


#------------------------------------------------------------
# Compute the results
#  This is a long computation, so we'll save the results to a pickle.
@pickle_results('photoz_forest.pkl')
def compute_photoz_forest(depth, nest):
    rms_test = np.zeros(len(depth))
    rms_train = np.zeros(len(depth))
    i_best = 0
    z_fit_best = None

    z_fit_arr = []

    for i, d in enumerate(depth):
        clf = RandomForestRegressor(n_estimators=nest, max_depth=d,
                                    random_state=0, n_jobs=-1)
        clf.fit(mag_train, z_train)

        z_fit_train = clf.predict(mag_train)
        z_fit = clf.predict(mag_test)
        rms_train[i] = np.sqrt(np.mean((z_fit_train - z_train) ** 2))
        rms_test[i] = np.sqrt(np.mean((z_fit - z_test) ** 2))
        # R^2 score on the test set; after the loop this holds the value
        # for the deepest forest, not necessarily the best one.
        score = clf.score(mag_test, z_test)
        z_fit_arr.append(z_fit)

        if rms_test[i] <= rms_test[i_best]:
            i_best = i
            z_fit_best = z_fit

    return rms_test, rms_train, i_best, z_fit_best, z_fit_arr, score

@pickle_results('photoz_forest_nest.pkl')
def compute_photoz_forest_nest(depth, nest):
    rms_test = np.zeros(len(nest))
    rms_train = np.zeros(len(nest))
    i_best = 0
    z_fit_best = None

    z_fit_arr = []

    for i, n in enumerate(nest):
        clf = RandomForestRegressor(n_estimators=n, max_depth=depth,
                                    random_state=0, n_jobs=-1)
        clf.fit(mag_train, z_train)

        z_fit_train = clf.predict(mag_train)
        z_fit = clf.predict(mag_test)
        rms_train[i] = np.sqrt(np.mean((z_fit_train - z_train) ** 2))
        rms_test[i] = np.sqrt(np.mean((z_fit - z_test) ** 2))
        # R^2 score on the test set for the current forest.
        score = clf.score(mag_test, z_test)

        z_fit_arr.append(z_fit)

        if rms_test[i] <= rms_test[i_best]:
            i_best = i
            z_fit_best = z_fit

    return rms_test, rms_train, i_best, z_fit_best, z_fit_arr, score

depth = np.arange(1, 21)
nest = np.arange(1, 11)
rms_test, rms_train, i_best, z_fit_best, z_fit_arr, score = \
    compute_photoz_forest(depth, 10)
best_depth = depth[i_best]
(rms_test_nest, rms_train_nest, i_best_nest, z_fit_best_nest,
 z_fit_arr_nest, score_nest) = compute_photoz_forest_nest(best_depth, nest)
@pickle_results: computing results and saving to 'photoz_forest.pkl'
@pickle_results: computing results and saving to 'photoz_forest_nest.pkl'
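
The rms above can be inflated by a few catastrophic outliers. As a cross-check, a rank-based width of the residuals is more robust; a minimal sketch, assuming astroML's sigmaG estimator:

In [ ]:
# Sketch: robust width of the test residuals for the best depth,
# using sigmaG (an outlier-resistant estimate of the standard deviation).
from astroML.stats import sigmaG
print("rms    = %.4f" % rms_test[i_best])
print("sigmaG = %.4f" % sigmaG(z_fit_best - z_test))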

Plot figures as in the book

In [48]:
#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(15, 5))
fig.subplots_adjust(wspace=0.25,
                    left=0.1, right=0.95,
                    bottom=0.15, top=0.9)

# left panel: plot cross-validation results
ax = fig.add_subplot(121)
ax.plot(depth, rms_test, '-k', label='cross-validation')
ax.plot(depth, rms_train, '--k', label='training set')
ax.legend(loc=1)

ax.set_xlabel('depth of tree')
ax.set_ylabel('rms error')

ax.set_xlim(0, 21)
ax.set_ylim(0.009,  0.04)
ax.yaxis.set_major_locator(plt.MultipleLocator(0.01))

# right panel: plot best fit
ax = fig.add_subplot(122)
ax.scatter(z_test, z_fit_best, s=1, lw=0, c='k')
ax.plot([-0.1, 0.4], [-0.1, 0.4], ':k')
ax.text(0.03, 0.97, "depth = %i\nrms = %.3f" % (best_depth, rms_test[i_best]),
        ha='left', va='top', transform=ax.transAxes)

ax.set_xlabel(r'$z_{\rm true}$')
ax.set_ylabel(r'$z_{\rm fit}$')

ax.set_xlim(-0.02, 0.4001)
ax.set_ylim(-0.02, 0.4001)
ax.xaxis.set_major_locator(plt.MultipleLocator(0.1))
ax.yaxis.set_major_locator(plt.MultipleLocator(0.1))
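
Aside: a fitted forest also exposes relative feature importances, indicating which of the ugriz bands drives the photo-z prediction. A minimal sketch (not part of the book figure), refitting a single forest at the best depth found above:

In [ ]:
# Sketch: feature importances of one forest (10 trees, best depth).
rf = RandomForestRegressor(n_estimators=10, max_depth=best_depth,
                           random_state=0, n_jobs=-1)
rf.fit(mag_train, z_train)
for band, imp in zip('ugriz', rf.feature_importances_):
    print("importance of %s: %.3f" % (band, imp))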

Produce a slider for the tree depth (with a fixed number of trees)

In [49]:
def multiplot(depth):
    z_fit = z_fit_arr[depth - 1]
    plt.figure(figsize=(15, 10))
    ax = plt.gca()
    ax.scatter(z_test, z_fit, s=1, lw=0, c='k')
    ax.plot([-0.1, 0.4], [-0.1, 0.4], ':k')
    ax.text(0.03, 0.97, "depth = %i\nrms = %.3f" % (depth, rms_test[depth-1]),
            ha='left', va='top', transform=ax.transAxes)

    ax.set_xlabel(r'$z_{\rm true}$')
    ax.set_ylabel(r'$z_{\rm fit}$')

    ax.set_xlim(-0.02, 0.4001)
    ax.set_ylim(-0.02, 0.4001)
    ax.xaxis.set_major_locator(plt.MultipleLocator(0.1))
    ax.yaxis.set_major_locator(plt.MultipleLocator(0.1))
In [50]:
from ipywidgets import interact  # formerly IPython.html.widgets
interact(multiplot, depth=(1, 20, 1))
Out[50]:
<function __main__.multiplot>
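
For finer control than the (min, max, step) shorthand, an explicit slider widget can also set the starting value; a sketch assuming ipywidgets is installed:

In [ ]:
# Sketch: start the depth slider at the cross-validated optimum.
from ipywidgets import IntSlider
interact(multiplot, depth=IntSlider(min=1, max=20, step=1,
                                    value=int(best_depth)))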

Produce plots for various numbers of trees in the random forest (with fixed depth)

In [51]:
#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(15, 5))
fig.subplots_adjust(wspace=0.25,
                    left=0.1, right=0.95,
                    bottom=0.15, top=0.9)

# left panel: plot cross-validation results
ax = fig.add_subplot(121)
ax.plot(nest, rms_test_nest, '-k', label='cross-validation')
ax.plot(nest, rms_train_nest, '--k', label='training set')
ax.legend(loc=1)

ax.set_xlabel('number of estimators')
ax.set_ylabel('rms error')

ax.set_xlim(0, 10)
ax.set_ylim(0.009,  0.04)
ax.yaxis.set_major_locator(plt.MultipleLocator(0.01))

# right panel: plot best fit
ax = fig.add_subplot(122)
ax.scatter(z_test, z_fit_best_nest, s=1, lw=0, c='k')
ax.plot([-0.1, 0.4], [-0.1, 0.4], ':k')
ax.text(0.03, 0.97, "trees = %i\nrms = %.3f"
        % (nest[i_best_nest], rms_test_nest[i_best_nest]),
        ha='left', va='top', transform=ax.transAxes)

ax.set_xlabel(r'$z_{\rm true}$')
ax.set_ylabel(r'$z_{\rm fit}$')

ax.set_xlim(-0.02, 0.4001)
ax.set_ylim(-0.02, 0.4001)
ax.xaxis.set_major_locator(plt.MultipleLocator(0.1))
ax.yaxis.set_major_locator(plt.MultipleLocator(0.1))
In [52]:
def multiplot(nest):
    z_fit = z_fit_arr_nest[nest - 1]
    plt.figure(figsize=(15, 10))
    ax = plt.gca()
    ax.scatter(z_test, z_fit, s=1, lw=0, c='k')
    ax.plot([-0.1, 0.4], [-0.1, 0.4], ':k')
    ax.text(0.03, 0.97, "nest = %i\nrms = %.3f" % (nest, rms_test_nest[nest-1]),
            ha='left', va='top', transform=ax.transAxes)

    ax.set_xlabel(r'$z_{\rm true}$')
    ax.set_ylabel(r'$z_{\rm fit}$')

    ax.set_xlim(-0.02, 0.4001)
    ax.set_ylim(-0.02, 0.4001)
    ax.xaxis.set_major_locator(plt.MultipleLocator(0.1))
    ax.yaxis.set_major_locator(plt.MultipleLocator(0.1))
In [54]:
interact(multiplot, nest=(1, 10))
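
Finally, a short summary comparing the two sweeps, using only quantities computed above (sketch):

In [ ]:
# Sketch: best settings from the depth and n_estimators sweeps.
print("best depth        : %i (rms = %.4f)" % (best_depth, rms_test[i_best]))
print("best n_estimators : %i (rms = %.4f)"
      % (nest[i_best_nest], rms_test_nest[i_best_nest]))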