lang/laantas-script/Split.hs

67 lines
1.7 KiB
Haskell
Raw Normal View History

2021-04-28 06:29:21 -04:00
module Split (split) where
import Glyphs
2021-04-29 13:01:42 -04:00
import Text.Megaparsec
import Text.Megaparsec.Char
2021-04-28 06:29:21 -04:00
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Char as Char
import Data.Map (Map, (!))
import qualified Data.Map.Strict as Map
import Data.Void
type P = Parsec Void Text
2021-04-29 05:55:54 -04:00
lcChar :: P Char
lcChar = Char.toLower <$> anySingle
2021-04-28 06:29:21 -04:00
longestWith :: String -> (Text -> Maybe a) -> P a
2021-04-29 05:55:54 -04:00
longestWith name p = try $ go . Text.singleton =<< lcChar where
2021-04-28 06:29:21 -04:00
go acc = case p acc of
Nothing -> fail $ "longestWith " <> name <> ": " <> show acc
2021-04-29 05:55:54 -04:00
Just x -> try (do c <- lcChar; go $ Text.snoc acc c) <|> pure x
2021-04-28 06:29:21 -04:00
maxFrom :: String -> Map Text a -> P a
maxFrom name i = longestWith name \x -> Map.lookup x i
initial :: P Glyph
initial = maxFrom "initial" initials
medial :: P Segs
2021-04-29 05:55:54 -04:00
medial = P [] <$ chunk "\\" <|> maxFrom "medial" medials
2021-04-28 06:29:21 -04:00
final :: P Glyph
final = maxFrom "final" finals
ivowel :: P Glyph
ivowel = maxFrom "vowel" vowels
word :: P [Piece]
2021-04-29 13:01:42 -04:00
word = [is <> f <> concat p | is <- some initMed, f <- fin, p <- many punct]
where
2021-04-28 06:29:21 -04:00
initMed = try $
2021-04-29 05:55:54 -04:00
[(i, [m]) | i <- initial, m <- medial] <|>
[(v, []) | v <- ivowel] <|>
dash
2021-04-28 06:29:21 -04:00
fin = maybe [] (\x -> [(x, [])]) <$> optional final
number :: P [Piece]
2021-04-29 05:55:54 -04:00
number = some (digit <|> hash) where
2021-04-28 06:29:21 -04:00
hash = (num, []) <$ chunk "#"
digit = [(numbers ! Char.digitToInt i, []) | i <- digitChar]
punct :: P [Piece]
2021-04-29 13:01:42 -04:00
punct = [[(p, [])] | p <- maxFrom "punctuation" punctuation] <* space
2021-04-28 06:29:21 -04:00
2021-04-29 05:55:54 -04:00
dash :: P Piece
dash = (wave, []) <$ chunk ""
2021-04-28 06:29:21 -04:00
text :: P [[Piece]]
text = space *> many (segment <* space) <* eof where
segment = punct <|> number <|> word
split :: Text -> [[Piece]]
split = either (error . errorBundlePretty) id . parse text ""