2021-04-28 06:29:21 -04:00
|
|
|
|
module Split (split) where
|
|
|
|
|
|
|
|
|
|
import Glyphs
|
2021-04-29 13:01:42 -04:00
|
|
|
|
import Text.Megaparsec
|
|
|
|
|
import Text.Megaparsec.Char
|
2021-04-28 06:29:21 -04:00
|
|
|
|
import Data.Text (Text)
|
|
|
|
|
import qualified Data.Text as Text
|
|
|
|
|
import qualified Data.Char as Char
|
|
|
|
|
import Data.Map (Map, (!))
|
|
|
|
|
import qualified Data.Map.Strict as Map
|
|
|
|
|
import Data.Void
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
type P = Parsec Void Text
|
|
|
|
|
|
|
|
|
|
|
2021-04-29 05:55:54 -04:00
|
|
|
|
lcChar :: P Char
|
|
|
|
|
lcChar = Char.toLower <$> anySingle
|
|
|
|
|
|
2021-04-28 06:29:21 -04:00
|
|
|
|
longestWith :: String -> (Text -> Maybe a) -> P a
|
2021-04-29 05:55:54 -04:00
|
|
|
|
longestWith name p = try $ go . Text.singleton =<< lcChar where
|
2021-04-28 06:29:21 -04:00
|
|
|
|
go acc = case p acc of
|
|
|
|
|
Nothing -> fail $ "longestWith " <> name <> ": " <> show acc
|
2021-04-29 05:55:54 -04:00
|
|
|
|
Just x -> try (do c <- lcChar; go $ Text.snoc acc c) <|> pure x
|
2021-04-28 06:29:21 -04:00
|
|
|
|
|
2023-12-25 15:44:48 -05:00
|
|
|
|
unthorn :: Text -> Text
|
|
|
|
|
unthorn = Text.map \case 'þ' -> 'ƶ'; 'ð' -> 'ƶ'; c -> c
|
|
|
|
|
|
2021-04-28 06:29:21 -04:00
|
|
|
|
maxFrom :: String -> Map Text a -> P a
|
2023-12-25 15:44:48 -05:00
|
|
|
|
maxFrom name i = longestWith name \x -> Map.lookup (unthorn x) i
|
2021-04-28 06:29:21 -04:00
|
|
|
|
|
|
|
|
|
initial :: P Glyph
|
|
|
|
|
initial = maxFrom "initial" initials
|
|
|
|
|
|
2021-05-10 18:52:17 -04:00
|
|
|
|
medial :: P [Diacritic]
|
|
|
|
|
medial = [] <$ chunk "\\" <|> maxFrom "medial" medials
|
2021-04-28 06:29:21 -04:00
|
|
|
|
|
|
|
|
|
final :: P Glyph
|
|
|
|
|
final = maxFrom "final" finals
|
|
|
|
|
|
|
|
|
|
ivowel :: P Glyph
|
|
|
|
|
ivowel = maxFrom "vowel" vowels
|
|
|
|
|
|
2021-05-10 18:52:17 -04:00
|
|
|
|
word :: P [EGlyph]
|
2021-04-29 13:01:42 -04:00
|
|
|
|
word = [is <> f <> concat p | is <- some initMed, f <- fin, p <- many punct]
|
|
|
|
|
where
|
2021-04-28 06:29:21 -04:00
|
|
|
|
initMed = try $
|
2021-05-10 18:52:17 -04:00
|
|
|
|
(,) <$> initial <*> medial <|>
|
2021-04-29 05:55:54 -04:00
|
|
|
|
[(v, []) | v <- ivowel] <|>
|
|
|
|
|
dash
|
2021-04-28 06:29:21 -04:00
|
|
|
|
fin = maybe [] (\x -> [(x, [])]) <$> optional final
|
|
|
|
|
|
2021-05-10 18:52:17 -04:00
|
|
|
|
number :: P [EGlyph]
|
2021-04-29 05:55:54 -04:00
|
|
|
|
number = some (digit <|> hash) where
|
2021-04-28 06:29:21 -04:00
|
|
|
|
hash = (num, []) <$ chunk "#"
|
|
|
|
|
digit = [(numbers ! Char.digitToInt i, []) | i <- digitChar]
|
|
|
|
|
|
2021-05-10 18:52:17 -04:00
|
|
|
|
punct :: P [EGlyph]
|
2021-04-29 13:01:42 -04:00
|
|
|
|
punct = [[(p, [])] | p <- maxFrom "punctuation" punctuation] <* space
|
2021-04-28 06:29:21 -04:00
|
|
|
|
|
2021-05-10 18:52:17 -04:00
|
|
|
|
dash :: P EGlyph
|
2021-04-29 05:55:54 -04:00
|
|
|
|
dash = (wave, []) <$ chunk "–"
|
|
|
|
|
|
2021-05-10 18:52:17 -04:00
|
|
|
|
text :: P [[EGlyph]]
|
2021-04-28 06:29:21 -04:00
|
|
|
|
text = space *> many (segment <* space) <* eof where
|
|
|
|
|
segment = punct <|> number <|> word
|
|
|
|
|
|
2021-05-10 18:52:17 -04:00
|
|
|
|
split :: Text -> [[EGlyph]]
|
2021-04-28 06:29:21 -04:00
|
|
|
|
split = either (error . errorBundlePretty) id . parse text ""
|