Keboola WIKI‎ > ‎Keboola Connection (KBC)‎ > ‎2. Using KBC‎ > ‎Recipes‎ > ‎

Text Splitter in R



##The following transforms two column csvs containing customer IDs and customer comments into 4 or 5 column
### csvs containing separate rows for every 100 or less characters of the original comment, without splitting words across rows
### output includes id, text, pks and row numbers for a given comment, and, optionally, a sort_order variable equivalent to row

###As long as your input and output match the above description, you only need edit lines 11,12,14,15, and 19 across use cases

############################
### Edit only below here ###
############################

input.name<-"184069221.csv" #change to name and directory of input csv
output.name<-"sorted.csv" #change to name and directory of desired output csv

id.variable.name<-"id" #change if id column is named anything other than "id", eg. "customer.id". NOT case sensitive. do NOT remove quotes
body.variable.name<-"body" #change if body column is named anything other than "id", eg. "text". NOT case sensitive. do NOT remove quotes

#Note: output will always be a 4 column table with headers "pk","id","row",and "text"

sort.order<-TRUE 

#if sort.order = TRUE, the output will contain a fifth column, sort_order, which contains the exact same content as row

############################
### Edit only above here ###
############################

library(plyr) #includes the ddply function

data<-read.csv(input.name,header=T) #reads in data specified on line 11


###The following 13 lines puts column headers into a standard format, and converts all columns to character (string)
colnames(data)<-tolower(colnames(data))
id.variable.name<-tolower(id.variable.name)
body.variable.name<-tolower(body.variable.name)

data["id"]<-data[id.variable.name]
if(id.variable.name!="id"){
  data[id.variable.name]<-NULL
}
data$id<-as.character(data$id)

data["body"]<-data[body.variable.name]
if(body.variable.name!="body"){
  data[body.variable.name]<-NULL
}
data$body<-as.character(data$body)

###Adds a unique id to every row of the dataset
data$rownum.temp<-1:nrow(data)


###Below defines a function which does the text splitting for any given row
###Produces a list containing the ID in element [[1]], and a list of 100char or less vectors of text in element [[2]]
body.parse<-function(row){
  id<-row$id
  body<-row$body
  
  body<-tryCatch({
    strsplit(body," ")[[1]]
  }, error=function(e){
    NULL
  })
  
  blocks<-NULL
  while(length(body)>0){
    i<-0
    com<-""
    while(nchar(com) < 101){
      i<-i+1
      if(is.na(body[i])){break}
      com<-paste(body[1:i],collapse=" ")
    }
    blocks<-c(blocks,paste(body[1:i-1],collapse=" "))
    body<-body[-c(1:(i-1))]
  }
  blocks<-blocks[tryCatch({complete.cases(blocks)},error=function(e){""})]
  
  out<-list(id,blocks)
  names(out)<-c("id","body")
  out
}


### Defines a function which transforms the output of the body.parse function into the desired tabular form
### this includes producing the pk and row variables
### takes a single row as input
frame.body<-function(row){
  mylist<-body.parse(row)
  
  if(is.null(mylist$body)){
    mylist$body<-""
  }
  
  len<-length(mylist$body)
  
  temp<-as.data.frame(matrix(ncol=4,nrow=len))
  colnames(temp)<-c("pk","id","row","text")
  temp$text[(1:len)]<-mylist$body[(1:len)]
  temp$row<-(1:len)-1
  temp$id<-mylist$id
  temp$pk<-with(temp, paste0(id,"_",row))
  
  temp
}

###applies the frame.body function to each row of the data, and produces a single data.frame output
output<-ddply(data,~rownum.temp,frame.body)

###Deletes the temporary row ID defined on line 50
output$rownum.temp<-NULL

###Adds sort_order column if requested
output$sort_order<-if(sort.order){output$row}else{NULL}

###saves out transformed data with location and name specified in line 12
write.csv(output, output.name,row.names=FALSE)
Comments