/* This program contains routines to handle Unicode, especially those functions
   requiring downloading of Unicode tables to work correctly.

   Frink already has lots of powerful Unicode-aware string functions, which can
   be seen in:

   https://frinklang.org/#CorrectStringParsing

   Much other Unicode testing of single characters can be done through the
   Java class Character.  For example:

      callJava["java.lang.Character", "isWhitespace", char["\t"]]

   which returns true.

   The java.lang.Character class is disastrous and poorly-designed, though,
   with methods like getType not returning consistent bitmapped values.
*/
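
/* For example (a sketch; these calls are not used by this class): other
   single-character tests in java.lang.Character can be called the same way.
   Note that getType returns a small integer category code such as
   Character.UPPERCASE_LETTER (which is 1), not a bitmapped value:

      callJava["java.lang.Character", "isLetter", char["A"]]   // true
      callJava["java.lang.Character", "getType",  char["A"]]   // 1
*/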
class Unicode
{
   /** A dictionary mapping from codepoint (as integer) -> codepoint name */
   class var codepointNames = undef

   /** A dictionary mapping from char -> chars of confusable characters. */
   class var confusablesDict = undef

   /** A flag indicating if the "confusables" dictionary has been loaded. */
   class var confusablesLoaded = false

   /** The name of the Java class that knows characters. */
   class var CHARCLASS = "java.lang.Character"
   /** This returns the Unicode codepoint name for a codepoint (specified as an
       integer, a string containing one or more codepoints, or an array of
       either of these.) */
   class getCodepointName[i] :=
   {
      if codepointNames == undef
         loadCodepointNames[]

      if isInteger[i]
         return codepointNames@i
      else
         if isString[i]
         {
            result = new array
            for c = chars[i]
               result.push[codepointNames@c]

            return result
         } else
         {
            result = new array
            for c = i
               result.push[getCodepointName[c]]

            return result
         }
   }
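
   /* A usage sketch for getCodepointName (the exact names come from the
      Unicode tables in your Java runtime, so the results shown are
      illustrative):

         Unicode.getCodepointName[65]     // "LATIN CAPITAL LETTER A"
         Unicode.getCodepointName["Aé"]   // ["LATIN CAPITAL LETTER A",
                                          //  "LATIN SMALL LETTER E WITH ACUTE"]
   */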
   /** This searches the codepoint names for values that match a specific
       regular expression. */
   class searchNames[pattern] :=
   {
      if codepointNames == undef
         loadCodepointNames[]

      retval = new array
      for [codepoint, name] = codepointNames
         if name =~ pattern
            retval.push[[codepoint, char[codepoint], name]]

      // Sort by codepoint
      return sort[retval, byColumn[0]]
   }
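
   /* A usage sketch for searchNames: each entry of the returned array has the
      form [codepoint, character, name], sorted by codepoint.  For example,
      something like:

         Unicode.searchNames[%r/GREEK SMALL LETTER ALPHA/]

      would include an entry such as [945, "α", "GREEK SMALL LETTER ALPHA"].
   */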
   /** This is a human-readable name search.  It takes a regular expression
       and returns a string of names that match that pattern. */
   class prettySearchNames[pattern] :=
   {
      ret = ""
      // The loop variable is called "ch" rather than "char" so it doesn't
      // shadow the built-in char[] function used below.
      for [dec, ch, name] = Unicode.searchNames[pattern]
         ret = ret + toASCIIHigh[char[dec]] + "\t" + ch + "\t" + name + "\n"

      return ret
   }
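
   /* A usage sketch for prettySearchNames: each matching codepoint becomes one
      tab-separated line containing an ASCII-escaped form, the character
      itself, and its name.  For example, something like:

         println[Unicode.prettySearchNames[%r/BLACK STAR/]]

      would print a line containing "\u2605", the ★ character, and "BLACK STAR".
   */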
   /** This is a private method that loads the dictionary of codepoint names. */
   class loadCodepointNames[] :=
   {
      codepointNames = new dict
      min = staticJava[CHARCLASS, "MIN_CODE_POINT"]
      // max = staticJava[CHARCLASS, "MAX_CODE_POINT"]
      max = 0x1F9FF   // Only scans up through U+1F9FF rather than MAX_CODE_POINT
      for i = min to max
         if callJava[CHARCLASS, "isDefined", i]
            codepointNames@i = charName[i]
   }
   /** deconfuse[string]:  This follows the procedure in Unicode Technical
       Standard #39 for deconfusing similar characters in a string:

       http://www.unicode.org/reports/tr39/

       Specifically, section 4, "Confusable Detection".

       It uses the "confusables" table available at:

       http://www.unicode.org/Public/security/latest/confusables.txt

       to perform the deconfusing.

       You generally don't want to call this with a single string, but rather
       to call
          deconfuse[x] == deconfuse[y]
       to see if two separate strings are confusable.  Or, if
          deconfuse[x] != x
       then you might have reason to believe that the string x had confusable
       characters in it, and you might want to treat it with stronger security.

       It's *still* a rather weak notion of confusable, as combining characters
       and accents are not folded together.  You may want to first use
       normalizeUnicode[str] to perform some of this normalization.

       You might want to use something stronger like folding Unicode to ASCII:
       https://github.com/ericxtang/sunspot/blob/deafdd55f2a9534cc96471958ea1c206430832e7/sunspot/solr/solr/conf/mapping-FoldToASCII.txt
   */
   class deconfuse[str] :=
   {
      nfd = normalizeUnicode[str, "NFD"]
      loadConfusables[]

      result = new array
      for c = charList[nfd]
      {
         if confusablesDict.containsKey[c]
            result.push[confusablesDict@c]
         else
            result.push[c]
      }

      return normalizeUnicode[join["", result], "NFD"]
   }
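
   /* A usage sketch for deconfuse: compare the deconfused "skeletons" of two
      strings to see if they are confusable.  For example (the second string
      below contains CYRILLIC SMALL LETTER A in place of the Latin "a"):

         Unicode.deconfuse["paypal"] == Unicode.deconfuse["pаypal"]

      should evaluate to true if the confusables table maps the Cyrillic letter
      to its Latin look-alike, and

         Unicode.deconfuse[x] != x

      suggests that x contained confusable characters.
   */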
   /** Internal function to load the Unicode "confusables" file. */
   class loadConfusables[] :=
   {
      if confusablesLoaded
         return

      // This is a dictionary from source char to target string.
      confusablesDict = new dict

      // TODO:  Cache this file somewhere
      for line = lines["http://www.unicode.org/Public/security/latest/confusables.txt"]
      {
         if line =~ %r/^\s*#/
            next

         if [source, target] = line =~ %r/([A-F0-9]{4,6})\s*;\s*([\sA-F0-9]+)/
         {
            target = trim[target]
            sourceStr = char[parseInt[source, 16]]
            targetStr = char[map[{|x| parseInt[x,16]},
                                 split[" ", trim[target]]]]
            confusablesDict@sourceStr = targetStr
            // println["$sourceStr\t$targetStr"]
         }
      }

      confusablesLoaded = true
   }
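
   /* For reference, a data line in confusables.txt looks roughly like this
      (the entry shown is illustrative):

         0441 ;  0063 ;  MA   # ( с → c ) CYRILLIC SMALL LETTER ES → LATIN SMALL LETTER C

      The regular expression above captures the first hex field as the source
      codepoint and the second field (a space-separated list of hex codepoints)
      as the target, so here confusablesDict@"с" would map to "c".
   */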
}
/*
// Test code:
dumpChars[x] := println["$x\t" + uc[hex[char[x]]]]

original = "Inglês"
original = "\u2487"   // This is the Unicode character for parenthesized number twenty
dumpChars[original]

deconfused = Unicode.deconfuse[original]
dumpChars[deconfused]

dumpChars[Unicode.deconfuse["(2O)"]]   // That "O" is a capital letter O, not a zero
*/