Wednesday, May 3, 2017

Feature Filtering with Natural B-Splines for Functional Modeling Application


feature.filtering.ns <- function(X, df = 10, graph = FALSE)
{ 
    ## Purpose: Feature Filtering with Natural B-Splines for Functional Modeling Application.
    ##          Preprocess the feature data (design matrix) to enforce a natural regularization.
    ##          In other words, project each observation to each of "df" axis represented by B-spline Basis. 
    ## Arguments:
    ##   X: a design matrix (predictors) of dimension N x p (N observations, p predictors).
    ##      If "X" is a vector, convert it to a matrix with one row. 
    ##   df: Degrees of Freedom (number of natural B-splines basis functions to approximate the (continuous) coefficient function.
    ##       Beta (p x 1) = H (p x df) Theta (df x 1), where H represents "df" number of continuous basis functions
    ##       in the coefficient space, and Theta represents the linear combination parameters for constructing Beta.
    ##   graph: if TRUE, plot each observation (each row of X) as a time series,
    ##          its projection to the reduced space (formed by the natural B-spline basis),
    ##          and the natural B-spline basis. 
    ## Return: Preprocessed Features (X H) of dimension N x df, which can be used as "df" input into a predictive model.
    ##         X Beta = X H Theta = (X H) Theta
    ## Author: Feiming Chen, Date:  3 May 2017, 12:12
    ## ________________________________________________

    if (is.vector(X)) X <- matrix(X, nrow = 1)

    p <- ncol(X)                        # number of predictors
    require(splines)
    ## B-splines for Natural Cubic Spline: a p x df  matrix.
    H <- ns(1:p, df=df)   

    ## Filtered Features.  Dimensions: N x p  p x df = N x df
    ## Project each observation "x" into one of "df" Basis (each column of H matrix) so that
    ## each "x" is represented by "df" coefficient (coordinates in the coordinate system defined by "H")
    ans <- X %*% H                             

    if (graph) {
        par(mfrow=c(3,1))
        matplot(t(X), type = "b", xlab = "Feature Index", ylab = "Observation Value",
                main = "Original Features (High-Dimensional Space)")
        matplot(H, type="b", xlab = "Index", ylab ="Value", main="Natural B-spline Basis")
        matplot(t(ans), type = "b", xlab = "Feature Index", ylab = "Observation Value",
                main = "Filtered Features (Low-Dimensional Space)")
        par(mfrow=c(1,1))
    }

    ans
}
if (F) {                                # Unit Test
    X <- matrix(rnorm(1000), 20, 50)
    y <- feature.filtering.ns(X)
    y <- feature.filtering.ns(X, df = 20)    

    x <- rnorm(100)
    y <- feature.filtering.ns(x, graph=T)
}