normalizeNfc
This commit is contained in:
parent
c743a99356
commit
274ecfb58c
5 changed files with 99 additions and 60 deletions
|
@ -1,4 +1,4 @@
|
|||
module Quox.CharExtra
|
||||
module Quox.Unicode
|
||||
|
||||
import Generics.Derive
|
||||
|
||||
|
@ -133,3 +133,8 @@ isWhitespace : Char -> Bool
|
|||
isWhitespace ch =
|
||||
ch == '\t' || ch == '\r' || ch == '\n' ||
|
||||
case genCat ch of Separator _ => True; _ => False
|
||||
|
||||
|
||||
export
|
||||
%foreign "scheme:string-normalize-nfc"
|
||||
normalizeNfc : String -> String
|
|
@ -9,7 +9,7 @@ depends = base, contrib, elab-util, sop, snocvect
|
|||
|
||||
modules =
|
||||
Quox.NatExtra,
|
||||
Quox.CharExtra,
|
||||
Quox.Unicode,
|
||||
Quox.OPE,
|
||||
Quox.Pretty,
|
||||
Quox.Syntax,
|
||||
|
|
|
@ -2,7 +2,7 @@ module Tests
|
|||
|
||||
import Options
|
||||
import TAP
|
||||
import Tests.CharExtra
|
||||
import Tests.Unicode
|
||||
import Tests.Lexer
|
||||
import Tests.Parser
|
||||
import Tests.Equal
|
||||
|
@ -10,7 +10,7 @@ import System
|
|||
|
||||
|
||||
allTests = [
|
||||
CharExtra.tests,
|
||||
Unicode.tests,
|
||||
Lexer.tests,
|
||||
Parser.tests,
|
||||
Equal.tests
|
||||
|
|
|
@ -1,56 +0,0 @@
|
|||
module Tests.CharExtra
|
||||
|
||||
import Quox.NatExtra
|
||||
import Quox.CharExtra
|
||||
import Data.List
|
||||
import Data.String
|
||||
import Data.Maybe
|
||||
import TAP
|
||||
|
||||
|
||||
escape : Char -> Maybe String
|
||||
escape '\'' = Nothing
|
||||
escape c =
|
||||
if ord c > 0xFF then Nothing else
|
||||
case unpack $ show c of
|
||||
'\'' :: '\\' :: cs => pack . ('\\' ::) <$> init' cs
|
||||
_ => Nothing
|
||||
|
||||
display : Char -> String
|
||||
display c =
|
||||
let ord = cast c {to = Nat} in
|
||||
let c' = fromMaybe (singleton c) $ escape c in
|
||||
if 0x20 <= ord && ord <= 0xFF then
|
||||
"「\{c'}」"
|
||||
else
|
||||
let codepoint = padLeft 4 '0' $ showHex ord in
|
||||
"「\{c'}」 (U+\{codepoint})"
|
||||
|
||||
testOne : (Char -> Bool) -> Char -> Test
|
||||
testOne pred c = test (display c) $ unless (pred c) $ Left ()
|
||||
|
||||
testAll : String -> (Char -> Bool) -> List Char -> Test
|
||||
testAll label pred chars = label :- map (testOne pred) chars
|
||||
|
||||
|
||||
tests = "char extras" :- [
|
||||
testAll "id starts" isIdStart
|
||||
['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '𝟙'],
|
||||
testAll "not id starts" (not . isIdStart)
|
||||
['0', '_', '-', '‿', ' ', '[', ',', '.', '\1'],
|
||||
testAll "id continuations" isIdCont
|
||||
['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '0', '\''],
|
||||
testAll "not id continuations" (not . isIdCont)
|
||||
['_', '‿', ' ', '[', ',', '.', '\1'],
|
||||
testAll "id connectors" isIdConnector
|
||||
['_', '‿'],
|
||||
testAll "not id connectors" (not . isIdConnector)
|
||||
['a', ' ', ',', '-'],
|
||||
testAll "white space" isWhitespace
|
||||
[' ', '\t', '\r', '\n',
|
||||
'\x2028', -- line separator
|
||||
'\x2029' -- paragraph separator
|
||||
],
|
||||
testAll "not white space" (not . isWhitespace)
|
||||
['a', '-', '_', '\1']
|
||||
]
|
90
tests/Tests/Unicode.idr
Normal file
90
tests/Tests/Unicode.idr
Normal file
|
@ -0,0 +1,90 @@
|
|||
module Tests.Unicode
|
||||
|
||||
import Quox.NatExtra
|
||||
import Quox.Unicode
|
||||
import Data.List
|
||||
import Data.String
|
||||
import Data.Maybe
|
||||
import TAP
|
||||
|
||||
|
||||
maxLatin1 = '\xFF'
|
||||
|
||||
escape : Char -> Maybe String
|
||||
escape '\'' = Nothing
|
||||
escape c =
|
||||
if c > maxLatin1 then Nothing else
|
||||
case unpack $ show c of
|
||||
'\'' :: '\\' :: cs => pack . ('\\' ::) <$> init' cs
|
||||
_ => Nothing
|
||||
|
||||
codepoint : Char -> String
|
||||
codepoint = padLeft 4 '0' . showHex . cast
|
||||
|
||||
display : Char -> String
|
||||
display c =
|
||||
let c' = fromMaybe (singleton c) $ escape c in
|
||||
if '\x20' <= c && c <= maxLatin1
|
||||
then "「\{c'}」"
|
||||
else "「\{c'}」 (U+\{codepoint c})"
|
||||
|
||||
displayS' : String -> String
|
||||
displayS' =
|
||||
foldMap (\c => if c <= maxLatin1 then singleton c else "\\x\{codepoint c}") .
|
||||
unpack
|
||||
|
||||
displayS : String -> String
|
||||
displayS str =
|
||||
if all (<= maxLatin1) (unpack str)
|
||||
then "「\{str}」"
|
||||
else "「\{str}」 (\"\{displayS' str}\")"
|
||||
|
||||
testOneChar : (Char -> Bool) -> Char -> Test
|
||||
testOneChar pred c = test (display c) $ unless (pred c) $ Left ()
|
||||
|
||||
testAllChars : String -> (Char -> Bool) -> List Char -> Test
|
||||
testAllChars label pred chars = label :- map (testOneChar pred) chars
|
||||
|
||||
|
||||
testNfc : String -> String -> Test
|
||||
testNfc input result =
|
||||
test (displayS input) $
|
||||
let norm = normalizeNfc input in
|
||||
unless (norm == result) $
|
||||
Left [("expected", displayS result), ("received", displayS norm)]
|
||||
|
||||
testAlreadyNfc : String -> Test
|
||||
testAlreadyNfc input = testNfc input input
|
||||
|
||||
|
||||
|
||||
tests = "unicode" :- [
|
||||
"general categories" :- [
|
||||
testAllChars "id starts" isIdStart
|
||||
['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '𝟙'],
|
||||
testAllChars "not id starts" (not . isIdStart)
|
||||
['0', '_', '-', '‿', ' ', '[', ',', '.', '\1'],
|
||||
testAllChars "id continuations" isIdCont
|
||||
['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '0', '\''],
|
||||
testAllChars "not id continuations" (not . isIdCont)
|
||||
['_', '‿', ' ', '[', ',', '.', '\1'],
|
||||
testAllChars "id connectors" isIdConnector
|
||||
['_', '‿'],
|
||||
testAllChars "not id connectors" (not . isIdConnector)
|
||||
['a', ' ', ',', '-'],
|
||||
testAllChars "white space" isWhitespace
|
||||
[' ', '\t', '\r', '\n',
|
||||
'\x2028', -- line separator
|
||||
'\x2029' -- paragraph separator
|
||||
],
|
||||
testAllChars "not white space" (not . isWhitespace)
|
||||
['a', '-', '_', '\1']
|
||||
],
|
||||
|
||||
"normalisation" :- [
|
||||
testNfc "e\x301" "é",
|
||||
testAlreadyNfc "é",
|
||||
testAlreadyNfc ""
|
||||
-- idk if this is wrong it's chez's fault. or unicode's
|
||||
]
|
||||
]
|
Loading…
Reference in a new issue