i need compare 2 names see if 1 of them nickname of other. have 2 columns of names in data frame.
names <- data.frame(in_name = c("gary",'john','james','william','bill','paul','tom','annie','bella','sue'), match_name = c('garry','jon','jimmy','paul','william','pablo','thomas','anne','belle','susan'),stringsasfactors = f) names[] <- lapply(names, toupper) names$match <- 0
i have nickname table contains pairs of nicknames. in full set names may appear in multiple rows of pairs (as in case of 'bella' rows below)
nickname_table <- data.frame(names = c('garrett,garret,gary,garry' ,'ian,john,johnie,johnnie,johnny,jon' ,'jae,james,jamey,jay,jaymes,jem,jemmy,jim,jimi,jimmie,jimmy' ,'bill,billie,billy,wil,will,william,willie,willy' ,'paul,pauly,paulie' ,'maas,thom,thomas,tom,tomas,tommie,tommy' ,'ann,anna,anne,annette,annie,nan,nancy,nanette,nannie,nanny' ,'bella,belle,ibbie,issy,izzy,sabella' ,'isabella,isabelle,bella,belle' ,'sue,sukie,susan,susann,susanna,suzie')) nickname_table[] <- lapply(nickname_table, toupper)
i avoid using loop unable work out how function call, need store found row in temp variable, in order search second name presence in same row/s. need on million pairs of names , loop slow. current loop is:
library(sqldf) i=1 (i in 1:nrow(names)) { first_name <- names[i,1] match_name <- names[i,2] if(!is.na(first_name) & !is.na(match_name) & first_name != match_name) { if (nrow(subset(nickname_table,grepl(first_name,nickname_table$names)))>= 1) { possiblematch <- subset(nickname_table,grepl(first_name,nickname_table$names)) temp1 <- unique(as.data.frame(strsplit(gsub(" ", ",",reduce(paste,unlist(possiblematch))),","), stringsasfactors = f)) colnames(temp1) <- "names" temp2 <- data.frame(match_name, stringsasfactors = f) colnames(temp2) <- "names_1" if(nrow(sqldf("select a.* temp1 left join temp2 b on a.names=b.names_1 b.names_1 not null"))>= 1) { names[i,3] <- 1 } else names[i,3] <- 0 } else names[i,3] <- 0 } else names[i,3] <- 0 }
edit: attempted create function issue length of nickname table, , strings compared unequal, vectorised comparison seem not work.
functiona <- function (innames,matchnames,nickname_table1){ if(!is.na(innames) & !is.na(matchnames) & innames != matchnames) { if (length(subset(nickname_table1,grepl(innames,nickname_table1)))>= 1) { possiblematch <- subset(nickname_table1,grepl(innames,nickname_table1)) temp1 <- unique(as.data.frame(strsplit(gsub(" ", ",",reduce(paste,unlist(possiblematch))),","), stringsasfactors = f)) colnames(temp1) <- "names" temp2 <- data.frame(matchnames, stringsasfactors = f) colnames(temp2) <- "names_1" if(nrow(sqldf("select a.* temp1 left join temp2 b on a.names=b.names_1 b.names_1 not null"))>= 1) { return <- 1 } else return <- 0 } else return <- 0 } else return <- 0 } c <- mapply(functiona,names$in_name,names$match_name,nickname_table$names)
this can put single sql statement. prepend , append comma names
, in_name
, match_name
sure not partial matches , left join (to ensure rows of names
kept) nickname_table
using condition true when there match of both in_name
, match_name
same row of names
. sqlite function instr
checks whether first argument contains second argument substring.
sqldf("select distinct in_name, match_name, names not null 'match' names left join (select ',' || names || ',' names nickname_table) on instr(names, ',' || in_name || ',') , instr(names, ',' || match_name || ',')")
giving:
in_name match_name match 1 gary garry 1 2 john jon 1 3 james jimmy 1 4 william paul 0 5 bill william 1 6 paul pablo 0 7 tom thomas 1 8 annie anne 1 9 bella belle 1 10 sue susan 1