## Enumerate which characters each POSIX character class matches, then
## cross-check the classes against one another.
##
## for tests on Windows, confine to 1:65535 at least at first.
## (Change 'code_points' below; it is the only place the range is spelled out
## in this section.)
code_points <- 1:2097151

## One single-character string per code point, named by its hex code point.
ts <- intToUtf8(code_points, multiple = TRUE)
names(ts) <- sprintf("%x", code_points)

## Compare the default regex engine with PCRE (perl = TRUE).
grep("[[:space:]]", ts, value = TRUE) # Solaris also has a0 180e 2007
grep("[[:space:]]", ts, value = TRUE, perl = TRUE)
grep("[[:blank:]]", ts, value = TRUE)
grep("[[:blank:]]", ts, value = TRUE, perl = TRUE)

## Members of each class according to the default regex engine.
lw <- grep("[[:lower:]]", ts, value = TRUE)
up <- grep("[[:upper:]]", ts, value = TRUE)
al <- grep("[[:alpha:]]", ts, value = TRUE)
pt <- grep("[[:punct:]]", ts, value = TRUE)
gr <- grep("[[:graph:]]", ts, value = TRUE)
pr <- grep("[[:print:]]", ts, value = TRUE)
sp <- grep("[[:space:]]", ts, value = TRUE)
ct <- grep("[[:cntrl:]]", ts, value = TRUE)
keep <- list(lw = lw, up = up, al = al, pt = pt,
             gr = gr, pr = pr, sp = sp, ct = ct)
str(keep)
names(pr)[length(pr)] # 16-bit on Solaris 10.
#save(keep, file="/tmp/cl.rda")

## checks of consistency with the C99 definitions
setdiff(c(up, lw), al)
intersect(al, c(pt, sp, ct)) # or digit
gr2 <- setdiff(pr, sp) # C99 defn of graph
## setdiff() drops names, so repeat the comparison on the hex names.
ngr2 <- setdiff(names(pr), names(sp))
setdiff(ngr2, names(gr)) # "a0" "2007" "202f"
setdiff(names(gr), ngr2)
## with our tables this missed 200b, zero-width space, which glibc has in gr.
## in R-patched on macOS it missed 10b3a:f

## The classes below are expected to be disjoint.
intersect(lw, c(pt, sp, ct)) # or digit
intersect(pt, c(sp, al)) # or digit
intersect(sp, c(al, gr, pt)) # or digit
intersect(up, c(pt, sp, ct)) # or digit

## on glibc, printable are c(al,pt,sp,0:9).
## Digits, to complete the alpha/punct/space/digit decomposition of the
## printable set. Uses 'ts', 'al', 'pt', 'sp' and 'pr' computed earlier in
## the script.
dg <- grep("[[:digit:]]", ts, value = TRUE)

## We are considering private use areas to be printable
x <- c(0xE000:0xF8FF, 0xF0000:0xFFFFD, 0x100000:0x10FFFD)
priv <- intToUtf8(x, multiple = TRUE)
names(priv) <- sprintf("%x", x)
xx <- c(al, pt, sp, dg, priv)
setdiff(pr, xx)
## a0 2007 202f
setdiff(names(pr), names(xx))
setdiff(names(xx), names(pr)) # \t \n \v \f \r \u2028 \u2029


## punctuation and control, Perl properties
ts <- intToUtf8(1:255, multiple = TRUE) # sets Encoding 128-255
names(ts) <- sprintf("%x", 1:255)
grep("[[:punct:]]", ts, value = TRUE) # glibc has a0 b2 b3
# Solaris only has a1 ab b7 bb bf
grep("[[:punct:]]", ts, value = TRUE, perl = TRUE)
grep("\\p{P}", ts, perl = TRUE, value = TRUE) # Not $ + < = > ^ ` | ~ § ¶
grep("[[:cntrl:]]", ts, value = TRUE)
grep("[[:cntrl:]]", ts, value = TRUE, perl = TRUE)
grep("\\p{Ll}", ts, perl = TRUE, value = TRUE) # Lowercase letter


## In an 8-bit locale (but not C nor UTF-8) on Linux,
## PCRE looks beyond ASCII but seems wrong (using system isxxxx?)
ts <- intToUtf8(1:255, multiple = TRUE)
names(ts) <- sprintf("%x", 1:255)
grep("[[:space:]]", ts, value = TRUE, perl = TRUE, useBytes = TRUE)
grep("[[:blank:]]", ts, value = TRUE, perl = TRUE, useBytes = TRUE)
grep("[[:punct:]]", ts, value = TRUE, perl = TRUE, useBytes = TRUE)
grep("[[:punct:]]", ts, value = TRUE)
## both includes a0 or a1-bf, on macOS perl includes e0-ff
grep("[[:cntrl:]]", ts, value = TRUE)
grep("[[:cntrl:]]", ts, value = TRUE, perl = TRUE, useBytes = TRUE)
## includes c0-df


## ----------------------toupper/tolower --------------------------

## These should flip case or leave unchanged:
## 'If the argument is a lower-case letter, the toupper() function returns
## the corresponding upper-case letter if there is one; otherwise,
## the argument is returned unchanged.'

N <- 1:65533 # 65534:5 are invalid but validUTF8 accepts them.
## Extend the code-point list (started earlier in the script as 'N') past
## the two skipped values 65534 and 65535.
N <- c(N, 65536:2097151)
ts <- intToUtf8(N, multiple = TRUE) # surrogate points will be NA, and more above the BMP
names(ts) <- sprintf("%x", N)
ts <- ts[!is.na(ts)]
length(ts) # 1112061

## Case-mapped copies plus logical class membership for every character.
Up <- toupper(ts)
Lo <- tolower(ts)
lw <- grepl("[[:lower:]]", ts)
up <- grepl("[[:upper:]]", ts)

## look for discrepancies, which can easily happen with different versions of
## tables.
table(Up != ts)
table(Up != ts, lw)
head(ts[Up == ts & lw], 15) # l/case but with no u/case equivalent
ts[(Up != ts) & !lw] # symbols which can case-swap
table(Lo != ts)
table(Lo != ts, up)
head(ts[Lo == ts & up], 15) # u/case but with no l/case equivalent
ts[(Lo != ts) & !up] # symbols which can case-swap
## macOS had U+04f6
## which was an error in our tables as it should be u/case (and printable)

## Round trips: 'lw' and 'up' are re-bound from logical masks to the matching
## characters themselves.
lw <- grep("[[:lower:]]", ts, value = TRUE)
table(tolower(toupper(lw)) == lw) # 18 exceptions on glibc, including µ -> M
lw[tolower(toupper(lw)) != lw]
up <- grep("[[:upper:]]", ts, value = TRUE)
table(toupper(tolower(up)) == up) # 10 exceptions on glibc
up[toupper(tolower(up)) != up]

## Characters whose case mapping collides with that of an earlier character:
## name each mapped value by its source character, keep only those that
## actually changed, then list the repeated targets.
names(Up) <- ts
Up <- Up[Up != ts]
Up[duplicated(Up)] # 18 on glibc
names(Lo) <- ts
Lo <- Lo[Lo != ts]
Lo[duplicated(Lo)] # 9 on glibc


## Display width versus printability over the full code-point range.
code_points <- 1:2097151
ts <- intToUtf8(code_points, multiple = TRUE)
names(ts) <- sprintf("%x", code_points)
wd <- nchar(ts, type = "width")
table(wd)
pr2 <- grepl("[[:print:]]", ts)
table(pr2, wd)
# which is giving width 2 to 1 million non-printable chars.
head(ts[wd == 2 & !pr2 & !is.na(ts)])