Thursday, June 15, 2017

Tokenize a string into a vector of tokens

tokenize.string <- function(x, split = "[ ,:;]+")
{ 
    ## Purpose: Tokenize a string into a vector of tokens
    ## Arguments:
    ##   x: a string or a vector of strings
    ##   split: split characters (regular expression)
    ## Return: a character vector (if "x" is a string) or a list of character vectors. 
    ## Author: Feiming Chen, Date: 15 Jun 2017, 15:01
    ## ________________________________________________
    
    ans <- strsplit(x, split = split, fixed = FALSE) # use the "split" argument instead of a hard-coded pattern
    if (length(x) == 1) ans <- ans[[1]]
    ans
}
if (F) {                                # Unit Test
    x <- "IND,  UNR   INC ; TCU"
    tokenize.string(x)
    ## [1] "IND" "UNR" "INC" "TCU"
    tokenize.string(rep(x, 3))
    ## [[1]]
    ## [1] "IND" "UNR" "INC" "TCU"

    ## [[2]]
    ## [1] "IND" "UNR" "INC" "TCU"

    ## [[3]]
    ## [1] "IND" "UNR" "INC" "TCU"
}
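
Because the split pattern is an argument, the function can also handle other delimiters. A small usage sketch with a hypothetical pipe-delimited string (not from the original example):

if (F) {                                # Example: custom split pattern
    y <- "alpha | beta | gamma"         # hypothetical pipe-delimited input
    tokenize.string(y, split = "[ |]+")
    ## [1] "alpha" "beta"  "gamma"
}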
