lang/laantas-script/Split.hs

69 lines
1.8 KiB
Haskell
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

module Split (split) where
import Glyphs
import Text.Megaparsec
import Text.Megaparsec.Char
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Char as Char
import Data.Map (Map, (!))
import qualified Data.Map.Strict as Map
import Data.Void
type P = Parsec Void Text
lcChar :: P Char
lcChar = Char.toLower <$> anySingle
longestWith :: String -> (Text -> Maybe a) -> P a
longestWith name p = try $ go . Text.singleton =<< lcChar where
go acc = case p acc of
Nothing -> fail $ "longestWith " <> name <> ": " <> show acc
Just x -> try (do c <- lcChar; go $ Text.snoc acc c) <|> pure x
unthorn :: Text -> Text
unthorn = Text.map \case 'þ' -> 'ƶ'; 'ð' -> 'ƶ'; c -> c
maxFrom :: String -> Map Text a -> P a
maxFrom name i = longestWith name \x -> Map.lookup (unthorn x) i
initial :: P Glyph
initial = maxFrom "initial" initials
medial :: P [Diacritic]
medial = [] <$ chunk "\\" <|> maxFrom "medial" medials
final :: P Glyph
final = maxFrom "final" finals
ivowel :: P Glyph
ivowel = maxFrom "vowel" vowels
word :: P [EGlyph]
word = [is <> f <> concat p | is <- some initMed, f <- fin, p <- many punct]
where
initMed = try $
(,) <$> initial <*> medial <|>
[(v, []) | v <- ivowel] <|>
dash
fin = maybe [] (\x -> [(x, [])]) <$> optional final
number :: P [EGlyph]
number = some (digit <|> hash) where
hash = (num, []) <$ chunk "#"
digit = [(numbers ! Char.digitToInt i, []) | i <- digitChar]
punct :: P [EGlyph]
punct = [[(p, [])] | p <- maxFrom "punctuation" punctuation] <* space
dash :: P EGlyph
dash = (wave, []) <$ chunk ""
text :: P [[EGlyph]]
text = space *> many (segment <* space) <* eof where
segment = punct <|> number <|> word
split :: Text -> [[EGlyph]]
split = either (error . errorBundlePretty) id . parse text ""