Here is an implementation of @CodeManiac's idea with some optimisation and dealing with edge cases.
#' Split a string on top-level `?` characters
#'
#' Scans `x` one character at a time and splits it at every `?` that is
#' neither inside brackets (`{}` / `()`) nor inside a delimited span
#' (text between a pair of `'`, `"` or `%`). The `?` separators themselves
#' are dropped; everything else, including whitespace, is kept verbatim.
#'
#' @param x A single, non-missing character string.
#' @return A character vector of the pieces, in order. An empty input
#'   yields `""`; a leading or trailing `?` yields an empty piece.
splitter <- function(x) {
  stopifnot(is.character(x), length(x) == 1L, !is.na(x))

  chars <- strsplit(x, "")[[1]]
  openers <- c("{", "(")
  closers <- c("}", ")")
  delimiters <- c("'", "\"", "%")

  depth <- 0L                           # current bracket nesting level
  open_delim <- ""                      # delimiter currently open, "" if none
  is_split <- logical(length(chars))    # preallocated: TRUE at separators

  # seq_along() keeps the loop from running on empty input (1:0 would not).
  for (i in seq_along(chars)) {
    ch <- chars[i]
    if (nzchar(open_delim)) {
      # Inside a delimited span only the *same* delimiter closes it; every
      # other character (brackets, `?`, other quotes) is plain text.
      if (ch == open_delim) {
        open_delim <- ""
      }
    } else if (ch %in% delimiters) {
      open_delim <- ch
    } else if (ch %in% openers) {
      depth <- depth + 1L
    } else if (ch %in% closers) {
      depth <- depth - 1L
    } else if (ch == "?" && depth == 0L) {
      is_split[i] <- TRUE
    }
  }

  # Cut the original string between consecutive separator positions.
  split_at <- which(is_split)
  starts <- c(1L, split_at + 1L)
  ends <- c(split_at - 1L, length(chars))
  substring(x, starts, ends)
}
results :
# Nested brackets: only the `?` characters outside `()` / `{}` split the string.
x1 <- "1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}"
splitter(x1)
#> [1] "1 " " 2 " " (3 ? 4) " " {5 ? (6 ? 7)}"
# Quoted text: the `?` inside '3 ? 4' stays with its surrounding text.
x2 <- "1 ? 2 ? '3 ? 4' ? {5 ? (6 ? 7)}"
splitter(x2)
#> [1] "1 " " 2 " " '3 ? 4' " " {5 ? (6 ? 7)}"
An edge case I didn't think about when writing the question: characters between quotes are not candidates for separators.
# Unbalanced brackets inside quotes: the quoted `{(` does not prevent the
# later top-level `?` from splitting.
x3 <- "1 ? 2 ? '3 {(? 4' ? {5 ? (6 ? 7)}"
splitter(x3)
#> [1] "1 " " 2 " " '3 {(? 4' " " {5 ? (6 ? 7)}"
benchmark
Parsing is 10 times faster so far, though the solution above might be optimised further by using Rcpp. The parsing solution might also be optimized further.
Jan's and Onyambu's solutions are much more compact and elegant. Onyambu's handles nesting, quotes, and the edge case of separators trapped in quotes (though that was not part of the question), while Jan's doesn't. They are approximately as fast as each other.
#' Split a string on `?` outside flat bracket groups (Jan's regex)
#'
#' The pattern first tries to match a `{...}` group containing no braces or
#' a `(...)` group containing no parentheses; such a match is discarded with
#' the PCRE verbs `(*SKIP)(*FAIL)`, so any `?` inside it is never seen as a
#' separator. Any remaining `?` is a split point.
#'
#' @param x A character vector; only the first element's pieces are returned.
#' @return A character vector of the pieces of `x[1]`.
regex_split_jan <- function(x) {
  pattern <- "(?:\\{[^{}]*\\}|\\([^()]*\\))(*SKIP)(*FAIL)|\\?"
  # `TRUE`, not `T`: `T` is an ordinary variable that can be reassigned.
  strsplit(x, pattern, perl = TRUE)[[1]]
}
#' Split a string on `?` outside nested groups (Onyambu's regex)
#'
#' The pattern first tries to match a group opened by `(`, `{` or `'` and
#' closed by `'`, `)` or `}`, recursing into itself via `(?1)` for nested
#' groups; such a match is discarded with the PCRE verbs `(*SKIP)(*FAIL)`,
#' so any `?` inside it is never seen as a separator. Any remaining `?` is a
#' split point.
#'
#' @param x A character vector; only the first element's pieces are returned.
#' @return A character vector of the pieces of `x[1]`.
regex_split_onyambu <- function(x) {
  pattern <- "([({'](?:[^(){}']*|(?1))*[')}])(*SKIP)(*FAIL)|\\?"
  # `TRUE`, not `T`: `T` is an ordinary variable that can be reassigned.
  strsplit(x, pattern, perl = TRUE)[[1]]
}
# Time the three splitters (each followed by parse() on the resulting pieces)
# against parse_qm_args().
# NOTE(review): `x` and `parse_qm_args()` are not defined in this snippet;
# presumably they are defined earlier (e.g. `x` may be one of x1/x2/x3) —
# confirm before running.
microbenchmark::microbenchmark(
regex_jan = as.list(parse(text=regex_split_jan(x))),
regex_onyambu = as.list(parse(text=regex_split_onyambu(x))),
loop = as.list(parse(text=splitter(x))),
parse = parse_qm_args(x)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> regex_jan 89.1 92.15 112.114 92.95 94.45 1893.5 100 b
#> regex_onyambu 91.0 93.50 116.850 94.95 96.45 2056.1 100 b
#> loop 122.0 125.95 130.289 128.30 131.20 169.8 100 b
#> parse 10.7 13.55 14.642 14.80 15.65 25.3 100 a