## for tests on Windows, confine to 1:65535 at least at first.

## All code points 1..0x1fffff as one-character strings, named by hex value.
ts <- structure(intToUtf8(1:2097151, TRUE),
                names = sprintf("%x", 1:2097151))

## White space classes: default matching first, then with perl = TRUE.
grep("[[:space:]]", x = ts, value = TRUE) # Solaris also has a0 180e 2007
grep("[[:space:]]", x = ts, perl = TRUE, value = TRUE)
grep("[[:blank:]]", x = ts, value = TRUE)
grep("[[:blank:]]", x = ts, perl = TRUE, value = TRUE)

## Members of each POSIX class, collected under the short tags used below.
keep <- lapply(c(lw = "[[:lower:]]", up = "[[:upper:]]",
                 al = "[[:alpha:]]", pt = "[[:punct:]]",
                 gr = "[[:graph:]]", pr = "[[:print:]]",
                 sp = "[[:space:]]", ct = "[[:cntrl:]]"),
               grep, x = ts, value = TRUE)
## The later consistency checks use the classes as stand-alone vectors.
lw <- keep[["lw"]]
up <- keep[["up"]]
al <- keep[["al"]]
pt <- keep[["pt"]]
gr <- keep[["gr"]]
pr <- keep[["pr"]]
sp <- keep[["sp"]]
ct <- keep[["ct"]]
str(keep)
## hex name of the last printable character found
names(pr)[length(pr)] # 16-bit on Solaris 10.
#save(keep, file="/tmp/cl.rda")

## checks of consistency with the C99 definitions
## Each expression below is auto-printed for inspection; the trailing
## comments record results seen on particular platforms.
## upper- or lower-case characters that are not classed as alphabetic
setdiff(c(up,lw), al)
## alphabetic characters that are also classed as punct, space or cntrl
intersect(al, c(pt, sp, ct)) # or digit
## NOTE(review): gr2 is not referenced again in the visible script; the
## comparison that follows is done on the hex names (ngr2) instead.
gr2 <- setdiff(pr, sp) # C99 defn of graph
## the same difference taken on the hex names, then compared in both
## directions with the names of what [[:graph:]] matched
ngr2 <- setdiff(names(pr), names(sp))
setdiff(ngr2, names(gr)) # "a0"   "2007" "202f"
setdiff(names(gr), ngr2)
## with our tables this missed 200b, zero-width space, which glibc has in gr.
## in R-patched on macOS it missed 10b3a:f
## overlaps between each of lower, punct, space, upper and the other classes
intersect(lw, c(pt, sp, ct)) # or digit
intersect(pt, c(sp, al)) # or digit
intersect(sp, c(al, gr, pt)) # or digit
intersect(up, c(pt, sp, ct)) # or digit

## on glibc, printable are c(al,pt,sp,0:9).
dg <- grep("[[:digit:]]", x = ts, value = TRUE)
## The private use areas are treated as printable too, so build them
## explicitly, named by hex code point like the other vectors.
x <- c(0xE000:0xF8FF, 0xF0000:0xFFFFD, 0x100000:0x10FFFD)
priv <- structure(intToUtf8(x, TRUE), names = sprintf("%x", x))
## Candidate set of printable characters, compared with what
## [[:print:]] matched: by value first, then by name in both directions.
xx <- c(al, pt, sp, dg, priv)
setdiff(x = pr, y = xx) ## a0 2007 202f
setdiff(x = names(pr), y = names(xx))
setdiff(x = names(xx), y = names(pr)) # \t \n \v \f \r \u2028 \u2029

## punctuation and control, Perl properties
## Code points 1..255 only, named by hex value.
ts <- structure(intToUtf8(1:255, TRUE), # sets Encoding 128-255
                names = sprintf("%x", 1:255))
## POSIX punct: default matching, perl = TRUE, then the Unicode property P
grep("[[:punct:]]", x = ts, value = TRUE) # glibc has a0 b2 b3
# Solaris only has  a1  ab  b7  bb  bf
grep("[[:punct:]]", x = ts, perl = TRUE, value = TRUE)
grep("\\p{P}", x = ts, perl = TRUE, value = TRUE) # Not $ + < = > ^ ` | ~ § ¶
## POSIX cntrl: default matching, then perl = TRUE
grep("[[:cntrl:]]", x = ts, value = TRUE)
grep("[[:cntrl:]]", x = ts, perl = TRUE, value = TRUE)
grep("\\p{Ll}", x = ts, perl = TRUE, value = TRUE) # Lowercase letter

## In an 8-bit locale (but not C nor UTF-8) on Linux,
## PCRE looks beyond ASCII but seems wrong (using system isxxxx?)
## Code points 1..255 again, named by hex value.
ts <- structure(intToUtf8(1:255, TRUE),
                names = sprintf("%x", 1:255))
grep("[[:space:]]", x = ts, perl = TRUE, useBytes = TRUE, value = TRUE)
grep("[[:blank:]]", x = ts, perl = TRUE, useBytes = TRUE, value = TRUE)
## punct: bytewise perl matching, then default matching
grep("[[:punct:]]", x = ts, perl = TRUE, useBytes = TRUE, value = TRUE)
grep("[[:punct:]]", x = ts, value = TRUE)
## both includes a0 or a1-bf, on macOS perl includes e0-ff
## cntrl: default matching, then bytewise perl matching
grep("[[:cntrl:]]", x = ts, value = TRUE)
grep("[[:cntrl:]]", x = ts, perl = TRUE, useBytes = TRUE, value = TRUE)
## includes c0-df

## ----------------------toupper/tolower --------------------------

## These should flip case or leave unchanged:
##  'If the argument is a lower-case letter, the toupper() function returns
##   the corresponding upper-case letter if there is one; otherwise,
##   the argument is returned unchanged.'

## Code points to test: everything up to 0x1fffff except 65534 and 65535.
N <- 1:65533 # 65534:65535 (U+FFFE, U+FFFF) are invalid but validUTF8 accepts them.
N <- c(N, 65536:2097151)
ts <- intToUtf8(N, TRUE) # surrogate points will be NA, and more above the BMP
names(ts) <- sprintf("%x", N)
## drop the code points that could not be converted
ts <- ts[!is.na(ts)]
length(ts) # 1112061
## case-mapped copies of every character, plus logical class membership
Up <- toupper(ts)
Lo <- tolower(ts)
lw <- grepl("[[:lower:]]", ts)
up <- grepl("[[:upper:]]", ts)
## look for discrepancies, which can easily happen with different versions of
## tables.
## how many characters toupper() changes, then cross-tabulated by [[:lower:]]
table(Up != ts)
table(Up != ts, lw)
head(ts[Up == ts & lw], 15) # l/case but with no u/case equivalent
ts[(Up != ts) & !lw] # symbols which can case-swap
## the same questions for tolower() against [[:upper:]]
table(Lo != ts)
table(Lo != ts, up)
head(ts[Lo == ts & up], 15) # u/case but with no l/case equivalent
ts[(Lo != ts) & !up] # symbols which can case-swap
## macOS had U+04f6
## which was an error in our tables as it should be u/case (and printable)

## From here on lw and up hold the matching characters themselves rather
## than logical flags.  Round-trip checks: does mapping there and back
## return the original character?
lw <- grep("[[:lower:]]", ts, value = TRUE)
table(tolower(toupper(lw)) == lw) # 18 exceptions on glibc, including µ -> M
lw[tolower(toupper(lw)) != lw]
up <- grep("[[:upper:]]", ts, value = TRUE)
table(toupper(tolower(up)) == up) # 10 exceptions on glibc
up[toupper(tolower(up)) != up]

## Label each mapped value with its source character, keep only the
## characters that actually changed, and show mapped values that occur
## more than once (i.e. are reached from several source characters).
names(Up) <- ts
Up <- Up[Up != ts]
Up[duplicated(Up)] # 18 on glibc
names(Lo) <- ts
Lo <- Lo[Lo != ts]
Lo[duplicated(Lo)] # 9 on glibc

## Display widths: all code points 1..0x1fffff again, named by hex value.
ts <- structure(intToUtf8(1:2097151, TRUE),
                names = sprintf("%x", 1:2097151))
wd <- nchar(ts, type = "width")
table(wd)
## Cross-tabulate printability against reported width.
pr2 <- grepl("[[:print:]]", x = ts)
table(pr2, wd) # which is giving width 2 to 1 million non-printable chars.
## A few valid characters that are not printable yet get width 2.
head(ts[!is.na(ts) & !pr2 & wd == 2])