/* This program contains routines to handle Unicode, especially those
   functions requiring downloading of Unicode tables to work correctly.

   Frink already has lots of powerful Unicode-aware string functions, which
   can be seen in:
   https://frinklang.org/#CorrectStringParsing

   Much other Unicode testing of single characters can be done through the
   Java class Character.  For example:

   callJava["java.lang.Character", "isWhitespace", char["\t"]]

   which returns true.

   The java.lang.Character class is disastrous and poorly-designed, though,
   with methods like getType not returning consistent bitmapped values.
*/
class Unicode
{
   /** A dictionary mapping from codepoint (as integer) -> codepoint name.
       It is undef until loadCodepointNames[] has completed. */
   class var codepointNames = undef

   /** A dictionary mapping from char->chars of confusable characters. */
   class var confusablesDict = undef

   /** A flag indicating if the "confusables" dictionary has been loaded. */
   class var confusablesLoaded = false

   /** The name of the Java class that knows characters. */
   class var CHARCLASS = "java.lang.Character"

   /** This returns the Unicode codepoint name for a codepoint.

       i may be:
        * an integer codepoint:  the result is the name from the table
          (whatever the dictionary lookup yields for a codepoint that is not
          in the table).
        * a string:  the result is an array with one name per codepoint of
          the string.
        * anything else:  it is enumerated, and the result is an array
          containing the result of getCodepointName for each element.
   */
   class getCodepointName[i] :=
   {
      // The name table is built lazily on first use.
      if codepointNames == undef
         loadCodepointNames[]

      if isInteger[i]
         return codepointNames@i

      result = new array
      if isString[i]
      {
         // chars[] yields the integer codepoints of the string.
         for c = chars[i]
            result.push[codepointNames@c]
      } else
      {
         // Any other enumerable:  look up each element recursively.
         for c = i
            result.push[getCodepointName[c]]
      }

      return result
   }

   /** This searches the codepoint names for values that match a specific
       regular expression.

       Returns an array of [codepoint, character, name] triples, sorted by
       codepoint.
   */
   class searchNames[pattern] :=
   {
      if codepointNames == undef
         loadCodepointNames[]

      retval = new array
      for [codepoint, name] = codepointNames
         if name =~ pattern
            retval.push[[codepoint, char[codepoint], name]]

      // Sort by codepoint
      return sort[retval, byColumn[0]]
   }

   /** This is a human-readable name search.  It takes a regular expression
       and returns names that match that pattern.

       The result is a single string with one line per match.  Each line
       contains the character passed through toASCIIHigh, the character
       itself, and its name, separated by tabs and terminated by a newline.
   */
   class prettySearchNames[pattern] :=
   {
      rows = new array

      // The loop variable is deliberately not called "char" so that it
      // cannot be confused with the built-in char[] function.  The second
      // column produced by searchNames is already char[codepoint].
      for [dec, ch, name] = Unicode.searchNames[pattern]
         rows.push[toASCIIHigh[ch] + "\t" + ch + "\t" + name + "\n"]

      return join["", rows]
   }

   /** This is a private method that loads the dictionary of codepoint
       names.

       Only codepoints from Character.MIN_CODE_POINT through 0x1F9FF are
       scanned, so lookups of higher codepoints find no name.
   */
   class loadCodepointNames[] :=
   {
      // Build into a local dictionary and publish it only when complete, so
      // that a failure part-way through does not leave a partial table that
      // the "== undef" checks would treat as fully loaded.
      names = new dict
      min = staticJava[CHARCLASS, "MIN_CODE_POINT"]
//      max = staticJava[CHARCLASS, "MAX_CODE_POINT"]
      max = 0x1F9FF
      for i = min to max
         if callJava[CHARCLASS, "isDefined", i]
            names@i = charName[i]

      codepointNames = names
   }

   /** deconfuse[string]:  This follows the procedure in Unicode Technical
       Standard #39 for deconfusing similar characters in a string.

       http://www.unicode.org/reports/tr39/

       Specifically, section 4, "Confusable Detection".

       It uses the "confusables" table available at:
       https://www.unicode.org/Public/security/latest/confusables.txt

       to perform the deconfusing.

       You generally don't want to call this with a single string, but to
       call
          deconfuse[x] == deconfuse[y]
       to see if two separate strings are confusable.

       Or, if
          deconfuse[x] != x
       then you might have reason to believe that the string x had confusable
       characters in it, and you might want to treat it with stronger
       security.

       It's *still* a rather weak notion of confusable, as combining
       characters and accents are not folded together.  You may want to first
       use normalizeUnicode[str] to perform some of this normalization.

       You might want to use something stronger like folding Unicode to ASCII:
       https://github.com/ericxtang/sunspot/blob/deafdd55f2a9534cc96471958ea1c206430832e7/sunspot/solr/solr/conf/mapping-FoldToASCII.txt
   */
   class deconfuse[str] :=
   {
      // Step 1:  decompose.
      nfd = normalizeUnicode[str, "NFD"]
      loadConfusables[]

      // Step 2:  replace each character that has a table entry with its
      // target string;  other characters pass through unchanged.
      result = new array
      for c = charList[nfd]
      {
         if confusablesDict.containsKey[c]
            result.push[confusablesDict@c]
         else
            result.push[c]
      }

      // Step 3:  decompose again, as the replacements may not be in NFD.
      return normalizeUnicode[join["", result], "NFD"]
   }

   /** Internal function to load the Unicode "confusables" file.

       The file is downloaded at most once per successful load;  later calls
       return immediately.  If the download or parsing fails, nothing is
       published and the next call will try again.
   */
   class loadConfusables[] :=
   {
      if confusablesLoaded
         return

      // This is a dictionary from source char to target string.  It is built
      // locally and only published once the whole file has been read.
      table = new dict

      // TODO:  Cache this file somewhere
      // Fetched over https because the table is used for security decisions.
      for line = lines["https://www.unicode.org/Public/security/latest/confusables.txt"]
      {
         // Skip comment lines.
         if line =~ %r/^\s*#/
            next

         // Data lines look like:   source ; target codepoints ; type # comment
         if [source, target] = line =~ %r/([A-F0-9]{4,6})\s*;\s*([\sA-F0-9]+)/
         {
            sourceStr = char[parseInt[source, 16]]

            // Split on any run of whitespace so that doubled spaces or tabs
            // between target codepoints cannot produce empty tokens.
            targetStr = char[map[{|x| parseInt[x,16]}, split[%r/\s+/, trim[target]]]]
            table@sourceStr = targetStr
            // println["$sourceStr\t$targetStr"]
         }
      }

      confusablesDict = table
      confusablesLoaded = true
   }
}

/*
dumpChars[x] := println["$x\t" + uc[hex[char[x]]]]

original = "Inglês"
original = "\u2487"  // This is a Unicode character indicating parenthesized 20
dumpChars[original]
deconfuse = Unicode.deconfuse[original]
dumpChars[deconfuse]
dumpChars[Unicode.deconfuse["(2O)"]]  // This is a letter capital O, not a zero
*/