Wednesday, June 28, 2017

Span: A Predicate Functional that finds the location of the longest sequential run of elements where the predicate is true

Span <- function(f, x)
{ 
    ## Purpose: A Predicate Functional that finds the location of the longest sequential run
    ##          of elements where the predicate is true. 
    ## Arguments:
    ##   f: a predicate (a function that returns a single TRUE or FALSE) 
    ##   x: a list or data frame
    ## Return: The location of the longest sequential run of elements where the predicate is true. 
    ## Author: Feiming Chen, Date: 28 Jun 2017, 09:45
    ## ________________________________________________

    y <- rle(Where(f, x))
    z <- y$lengths
    w <- which.max(z)          # position of the segment with longest sequential run of TRUE
    pos.end <- sum(z[1:w])
    pos.start <- pos.end - z[w] + 1
    c(start=pos.start, end=pos.end)
}
if (F) {                                # Unit Test
    x <- c(3, 1, NA, 4, NA, NA, NA, 5, 9, NA, NA)
    Span(is.na, x)
    ## start   end 
    ##     5     7 
}

Where <- function(f, x)
{ 
    ## Purpose: A Predicate Functional
    ## See other Predicate Functionals: Filter(), Find(), Position()
    ## See other Functionals: Map(), Reduce() 
    ## Arguments:
    ##   f: a predicate (a function that returns a single TRUE or FALSE) (e.g. is.character, all, is.NULL)
    ##   x: a list or data frame
    ## Return: a logical vector 
    ## Author: Feiming Chen, Date: 28 Jun 2017, 09:45
    ## ________________________________________________

    vapply(x, f, logical(1))
}
if (F) {                                # Unit Test
    df <- data.frame(x = 1:3, y = c("a", "b", "c"), stringsAsFactors = T)
    Where(is.factor, df)
    ## Compare with: 
    Filter(is.factor, df)
    Find(is.factor, df)
    Position(is.factor, df)
}

Thursday, June 22, 2017

Draw a data frame as a R plot


my.table.plot0 <- function(x, 
                           highlight.matrix = NULL,
                           main = "Table", title2 = NULL, size = 20,
                           bold.lastrow = FALSE, bold.lastcol = FALSE
                           ) 
{
    ## PURPOSE: Draw a data frame as a R plot. Require R package "gridExtra". 
    ## ARGUMENT:
    ##    x: a data frame
    ##    highlight.matrix: if provided a logical matrix (same dimension as "x"). Then "TRUE" cells will be highlighted. 
    ##    main: title for the plot
    ##    title2: additional title with more verbose text
    ##    size: text size of table text
    ##    bold.lastrow: if T, make the last row bold font.
    ##    bold.lastcol: if T, make the last column bold font.
    ## RETURN: a plot
    ## DATE: 21 Nov 2016, 15:46
    ## -----------------------------------------------------

    x <- as.data.frame(x)
    n1 <- nrow(x)
    n2 <- ncol(x)
    cl <- matrix("black", n1, n2)
    cl[is.na(x)] <- "gray"

    ## Bold Total Column and Row
    cl2 <- cl
    cl2[] <- "plain"
    if (bold.lastrow) {
        cl2[n1,] <- "bold"
    }
    if (bold.lastcol) {
        cl2[, n2] <- "bold"
    }
    
    require(gridExtra)
    table_theme <- ttheme_default(base_size = size,
                                  core = list(fg_params=list(col=cl, fontface = cl2)))
    ## Highlight Cells
    if (!is.null(highlight.matrix)) {
        cl3 <- cl
        z <- ttheme_default()$core$bg_params$fill # default color
        z1 <- rep(z, length.out = nrow(cl3))
        for (i in 1:ncol(cl3)) cl3[,i] <- z1
        cl3[highlight.matrix] <- "yellow"
        table_theme$core$bg_params$fill <- cl3
    }

    plot.new()
    gridExtra::grid.table(as.data.frame(x), theme=table_theme)
    ## Title Text
    t2 <- ifelse(is.null(title2), "", paste0("\n", graph.title.split.lines(title2, sep=" ", num.chars = 40)))
    title(main = paste0(main, t2))

}
if (F) {                                # Unit Test 
    x <- data.frame(Type=c('Car', 'Plane'), Speed=c(NA, 200))
    my.table.plot0(x)
    my.table.plot0(x, main='Test', title2='a very complex test', size=30, bold.lastcol = T, highlight.matrix = matrix(c(NA, F, F, T) , 2, 2))
    x <- data.frame(Type=rep('Car', 20), Speed=rep(100, 20))
    my.table.plot0(x)
}

my.table.plot <- function(x, highlight.matrix = NULL, bold.lastrow = FALSE, n = 20, ...)
{ 
    ## Purpose: Split a data frame to fit in each page and call "my.table.plot0". 
    ## Arguments:
    ##   x: a data frame or matrix
    ##   highlight.matrix: if provided a logical matrix (same dimension as "x"). Then "TRUE" cells will be highlighted. 
    ##   bold.lastrow: if T, make the last row bold font.
    ##   n: how many rows to fit into each page.
    ##   ...: pass to "my.table.plot0". 
    ## Return: multiple plots to plot the data matrix. 
    ## Author: Feiming Chen, Date:  5 Jul 2017, 15:16
    ## ________________________________________________

    x <- as.data.frame(x)
    N <- nrow(x)
    if ( N <= n ) {
        my.table.plot0(x, highlight.matrix = highlight.matrix, bold.lastrow = bold.lastrow, ...)
    } else {
        s <- rep(seq.int(N %/% 20 + 1), each = n, length.out = N)
        xs <- split(x, s)
        
        ng <- length(xs)                # number of pages
        bold.lastrow.list <- c(rep(FALSE, ng-1), bold.lastrow)
        
        if (is.null(highlight.matrix)) {
            mapply(my.table.plot0, xs, bold.lastrow = bold.lastrow.list, MoreArgs = list(...))
        } else {
            ys <- split(highlight.matrix, s)
            mapply(my.table.plot0, xs, highlight.matrix = ys, bold.lastrow = bold.lastrow.list, MoreArgs = list(...))
        }
    }
}
if (F) {                                # Unit Test
     x <- data.frame(Type=rep('Car', 50), Speed=rep(100, 50))
     h <- matrix(rep(c(F, T), 50), ncol=2)
     my.table.plot(x, highlight.matrix = h, bold.lastrow = T)
}

Wednesday, June 21, 2017

Add a "Total" level to each specified categorical variable in the data frame


df.add.total.category  <- function(dat, var.list, new.level.label = "Total")
{ 
    ## Purpose: Add a "Total" level to each specified categorical variable in the data frame. 
    ## Arguments:
    ##   dat: a data frame with several categorical variables for making tables.
    ##   var.list: a list of variable names in "dat".
    ##   new.level.label: the name for the new "total" level for the specified variable. 
    ## Return: 
    ## Author: Feiming Chen, Date: 21 Jun 2017, 13:02
    ## ________________________________________________

    if ( missing(var.list) )  {
        var.list <- names(dat)
    }

    d0 <- dat                           # to make new label across all variables
    lapply(var.list,  function( v )  {
        d <- dat
        d[[v]] <- new.level.label
        d0[[v]] <<- new.level.label
        d
    }) -> dd

    do.call(rbind, c(list(dat), dd, list(d0)))
}
if (F) {                                # Unit Test
    dat <- data.frame(A=gl(3,3), B=gl(3, 1, 9), C=11:19)
    df.add.total.category(dat, var.list=c("A", "B"))
}

Monday, June 19, 2017

Put unique elements of a string vector into one long string

paste.unique.string  <- function(x, sep = "/")
{ 
    ## Purpose: Put unique elements of a string vector into one long string. 
    ## Arguments:
    ##    x: a string vector
    ##    sep: what separator to use when pasting all unique elements together. 
    ## Return:  a single string with unique elements of "x". 
    ## Author: Feiming Chen, Date: 19 Jun 2017, 14:00
    ## ________________________________________________
    
    paste0(unique(x), collapse = sep)
}
if (F) {                                # Unit Test
    paste.unique.string(c("a", "b", "a", "b")) # "a/b"
}

Add "0" to the front of numbers to make n-digit string

prepend.zero.to.numbers  <- function(x, n = NULL, pad.char = "0")
{ 
    ## Purpose: Add "0" to the front of numbers to make n-digit string. 
    ## Arguments:
    ##   x: a vector of integers/characters
    ##   n: Number of total digits after padding with zero. Default to
    ##      using the largest number to calculate "n".
    ##   pad.char: what character to pad.  Default to "0". 
    ## Return: a character vector with nuumbers padded with zero's in the head.
    ## Author: Feiming Chen, Date: 19 Jun 2017, 13:19
    ## ________________________________________________

    x1 <- as.character(x)
    y <- nchar(x1)
    if (is.null(n)) n <- nchar(as.character(max(x)))
    y1 <- n - y                         # how long is the padding for each number
    pad.vector <- sapply(y1, function(w) paste(rep(pad.char, w), collapse = ""))
    paste0(pad.vector, x1)
}
if (F) {                                # Unit Test
    x <- c(2, 14, 156, 1892)
    prepend.zero.to.numbers(x)          # "0002" "0014" "0156" "1892"
    prepend.zero.to.numbers(x, pad.char = "X") # "XXX2" "XX14" "X156" "1892"
    prepend.zero.to.numbers(x, n = 6)          # "000002" "000014" "000156" "001892"
    y <- c("A", "BC", "DEF")
    prepend.zero.to.numbers(y, 3) # "00A" "0BC" "DEF"
}

Parse a string to generate a data frame


parse2data.frame  <- function(x)
{ 
    ## Purpose: Parse a string to generate a data frame.
    ## Arguments:
    ##   x: a string with header and multiple rows of text that resembles a data frame.
    ##      Fields are separated by comma. 
    ## Return: a data frame
    ## Author: Feiming Chen, Date: 19 Jun 2017, 10:12
    ## ________________________________________________

    d <- read.csv(textConnection(x), as.is = T)
    i <- sapply(d, is.character)
    d[i] <- lapply(d[i], trimws)
    d
}
if (F) {                                # Unit Test
    a = 
"

  Test, ID, Name
  t1 ,  3,  Sun
  t2 ,  5,  Moon
  t3,   2,  Earth

"    
    b <- parse2data.frame(a)
    str(b)
    ## 'data.frame':    3 obs. of  2 variables:
    ##  $ ID  : int  3 5 2
    ##  $ Name: Factor w/ 3 levels " Earth"," Moon",..: 3 2 1
}

Thursday, June 15, 2017

Tokenize a string into a vector of tokens

tokenize.string <- function(x, split = "[ ,:;]+")
{ 
    ## Purpose: Tokenize a string into a vector of tokens
    ## Arguments:
    ##   x: a string or a vector of strings
    ##   split: split characters (regular expression)
    ## Return: a character vector (if "x" is a string) or a list of character vectors. 
    ## Author: Feiming Chen, Date: 15 Jun 2017, 15:01
    ## ________________________________________________
    
    ans <- strsplit(x, split="[ ,:;]+", fixed=F)
    if (length(x) == 1) ans <- ans[[1]]
    ans
}
if (F) {                                # Unit Test
    x <- "IND,  UNR   INC ; TCU"
    tokenize.string(x)
    ## [1] "IND" "UNR" "INC" "TCU"
    tokenize.string(rep(x, 3))
    ## [[1]]
    ## [1] "IND" "UNR" "INC" "TCU"

    ## [[2]]
    ## [1] "IND" "UNR" "INC" "TCU"

    ## [[3]]
    ## [1] "IND" "UNR" "INC" "TCU"
}

Wednesday, June 14, 2017

Count the number of distinct values in a vector or distinct rows in a data frame

N.levels <- function(x)
{ 
    ## Purpose: Count the number of distinct values in a vector or distinct rows in a data frame
    ## Arguments:
    ##    x: a vector (numeric or string), or a data frame. 
    ## Return: a count for the distinct values/rows in the vector or data frame. 
    ## Author: Feiming Chen, Date: 20 Mar 2017, 10:58
    ## ________________________________________________

    sum(!duplicated(x))
}
if (F) {                                # Unit Test
    N.levels(c(1,2,1,3))                # 3
    N.levels(c("a", "b", "a"))          # 2
    x <- data.frame(a=c(1,2,1), b=c(1, 2, 1))
    N.levels(x) # 2
}

Tuesday, June 6, 2017

Write data tables to Excel (.xlsx) file


wxls <- function(x, file = "test", ...)
{ 
    ## Purpose: Write data tables to Excel (.xlsx) file.
    ##          Require package "openxlsx". 
    ## Arguments:
    ##   x: a data frame or a list of data frames. 
    ##   file: a naked file name with no extension.
    ##   ...: passed to "write.xlsx". 
    ## Return: Generate an Excel (.xlsx) file.
    ## Author: Feiming Chen, Date:  6 Jun 2017, 15:11
    ## ________________________________________________
    
    require(openxlsx)
    f <- paste0(file, ".xlsx")

    write.xlsx(x, file = f, asTable = TRUE,
               creator = "Feiming Chen", 
               tableStyle = "TableStyleMedium2",   ...)
}
if (F) {                                # Unit Test
    df <- data.frame("Date" = Sys.Date()-0:4,
                     "Logical" = c(TRUE, FALSE, TRUE, TRUE, FALSE),
                     "Currency" = paste("$",-2:2),
                     "Accounting" = -2:2,
                     "hLink" = "https://CRAN.R-project.org/",
                     "Percentage" = seq(-1, 1, length.out=5),
                     "TinyNumber" = runif(5) / 1E9, stringsAsFactors = FALSE)
    class(df$Currency) <- "currency"
    class(df$Accounting) <- "accounting"
    class(df$hLink) <- "hyperlink"
    class(df$Percentage) <- "percentage"
    class(df$TinyNumber) <- "scientific"

    wxls(df)

    wxls(list(A = df, B = df, C= df))   # write to 3 separate tabs with corresponding list names. 
}

Monday, June 5, 2017

Plot "y" against "x" where "x" is a vector of labels


my.plot.vs.label <- function(x, y, ...)
{ 
    ## Purpose: Plot "y" against "x" where "x" is a vector of labels (string)
    ## Arguments:
    ##   x: a string vector (labels)
    ##   y: a numeric vector or matrix
    ##   ...: passed to "matplot"
    ## Return: a plot
    ## Author: Feiming Chen, Date:  5 Jun 2017, 14:46
    ## ________________________________________________

    s <- seq(x)
    matplot(s, y, type="b", axes = F, xlab ="", ylab="", ...)
    axis(side=2)
    axis(side=1, at=s, labels=x, las=2)

    yn <- ncol(y)
    yna <- names(y)
    if (!is.null(yn) && !is.null(yna)) {
        legend("topright", legend=yna, col=1:yn, lwd=1.5, lty=1:yn)
    }
}
if (F) {                                # Unit Test
    x <- LETTERS
    y <- matrix(rnorm(260), ncol=10)
    my.plot.vs.label(x, y)
    y2 <- as.data.frame(y)
    my.plot.vs.label(x, y2)
}

Friday, June 2, 2017

Read/Import all CSV files in a specified directory

batch.read.csv.files <- function(p)
{ 
    ## Purpose: Read/Import all CSV files in a specified directory.
    ##          Require package "readr". 
    ## Arguments:
    ##   p: a file directory with CSV files. 
    ## Return:
    ##   a list, each element is a data frame (imported from the file) and its name is the corresponding file name. 
    ## Author: Feiming Chen, Date:  2 Jun 2017, 11:09
    ## ________________________________________________

    file.list <- list.files(p, pattern=".csv", full.names = TRUE, ignore.case = TRUE)

    if (length(file.list) > 0) { 
        dat <- lapply(file.list, readr::read_csv)
        names(dat) <- sapply(file.list, basename)
    } else dat <- list()

    cat("Import", length(dat), "CSV Files.\n")
    dat
}