lang/laantas-script/lib/Split.hs

70 lines
1.8 KiB
Haskell
Raw Normal View History

2021-04-28 12:29:21 +02:00
module Split (split) where
import Glyphs
2021-04-29 19:01:42 +02:00
import Text.Megaparsec
import Text.Megaparsec.Char
2021-04-28 12:29:21 +02:00
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Char as Char
import Data.Map (Map, (!))
import qualified Data.Map.Strict as Map
import Data.Void
type P = Parsec Void Text
2021-04-29 11:55:54 +02:00
lcChar :: P Char
lcChar = Char.toLower <$> anySingle
2021-04-28 12:29:21 +02:00
longestWith :: String -> (Text -> Maybe a) -> P a
2021-04-29 11:55:54 +02:00
longestWith name p = try $ go . Text.singleton =<< lcChar where
2021-04-28 12:29:21 +02:00
go acc = case p acc of
Nothing -> fail $ "longestWith " <> name <> ": " <> show acc
2021-04-29 11:55:54 +02:00
Just x -> try (do c <- lcChar; go $ Text.snoc acc c) <|> pure x
2021-04-28 12:29:21 +02:00
2023-12-25 21:44:48 +01:00
unthorn :: Text -> Text
unthorn = Text.map \case 'þ' -> 'ƶ'; 'ð' -> 'ƶ'; c -> c
2021-04-28 12:29:21 +02:00
maxFrom :: String -> Map Text a -> P a
2023-12-25 21:44:48 +01:00
maxFrom name i = longestWith name \x -> Map.lookup (unthorn x) i
2021-04-28 12:29:21 +02:00
initial :: P Glyph
initial = maxFrom "initial" initials
2021-05-11 00:52:17 +02:00
medial :: P [Diacritic]
medial = [] <$ chunk "\\" <|> maxFrom "medial" medials
2021-04-28 12:29:21 +02:00
final :: P Glyph
final = maxFrom "final" finals
ivowel :: P Glyph
ivowel = maxFrom "vowel" vowels
2021-05-11 00:52:17 +02:00
word :: P [EGlyph]
2021-04-29 19:01:42 +02:00
word = [is <> f <> concat p | is <- some initMed, f <- fin, p <- many punct]
where
2021-04-28 12:29:21 +02:00
initMed = try $
2021-05-11 00:52:17 +02:00
(,) <$> initial <*> medial <|>
2021-04-29 11:55:54 +02:00
[(v, []) | v <- ivowel] <|>
dash
2021-04-28 12:29:21 +02:00
fin = maybe [] (\x -> [(x, [])]) <$> optional final
2021-05-11 00:52:17 +02:00
number :: P [EGlyph]
2021-04-29 11:55:54 +02:00
number = some (digit <|> hash) where
2021-04-28 12:29:21 +02:00
hash = (num, []) <$ chunk "#"
digit = [(numbers ! Char.digitToInt i, []) | i <- digitChar]
2021-05-11 00:52:17 +02:00
punct :: P [EGlyph]
2021-04-29 19:01:42 +02:00
punct = [[(p, [])] | p <- maxFrom "punctuation" punctuation] <* space
2021-04-28 12:29:21 +02:00
2021-05-11 00:52:17 +02:00
dash :: P EGlyph
2021-04-29 11:55:54 +02:00
dash = (wave, []) <$ chunk ""
2021-05-11 00:52:17 +02:00
text :: P [[EGlyph]]
2021-04-28 12:29:21 +02:00
text = space *> many (segment <* space) <* eof where
segment = punct <|> number <|> word
2021-05-11 00:52:17 +02:00
split :: Text -> [[EGlyph]]
2021-04-28 12:29:21 +02:00
split = either (error . errorBundlePretty) id . parse text ""