About

Metadata

  • Last updated by mizunashi_mana
  • License (Apache-2.0 OR MPL-2.0)
  • Categories Parsers
  • Maintained by: contact@mizunashi.work

  • Lottery factor: 0

Links

Installation

This package uses the Custom cabal build type

Package Flags

Use the -f option with cabal commands to enable flags

    develop (off by default)

    Turn on some options for development

Readme

Tlex: A Generator for Lexical Analysers

Hackage

Installation

Add the dependencies to package.cabal:

build-depends:
    base,
    bytestring,
    tlex,          -- main
    tlex-encoding, -- for utf8 parsing
    tlex-th,       -- for outputting lexer with Template Haskell
    charset,
    template-haskell,

Usage

Setup:

import qualified Data.CharSet                        as CharSet
import qualified Data.Word                           as Word
import qualified Language.Haskell.TH                 as TH
import qualified Language.Lexer.Tlex                 as Tlex
import qualified Language.Lexer.Tlex.Plugin.Encoding as TlexEnc
import qualified Language.Lexer.Tlex.Plugin.TH       as TlexTH


-- | Lexer state type; this example uses unit, i.e. a single state.
type LexerState = ()
-- | Action attached to a rule: builds a 'Token' from a list of code units.
type LexerAction = [LexerCodeUnit] -> Token
-- | Unit of input the scanner works on: one byte ('Word.Word8').
type LexerCodeUnit = Word.Word8

-- | Scanner builder specialised to this lexer's state, code unit, and action types.
type ScannerBuilder = TlexTH.THScannerBuilder LexerState LexerCodeUnit LexerAction
-- | Pattern over this lexer's code units.
type Pattern = Tlex.Pattern LexerCodeUnit

-- | Add one lexer rule: a pattern paired with the typed Template Haskell
-- expression of the action to run on a match.
--
-- The rule is registered with the state list @[()]@ (presumably the states in
-- which the rule is active; confirm against 'TlexTH.thLexRule').
rule :: Pattern -> TH.Q (TH.TExp LexerAction) -> ScannerBuilder ()
rule pat action = TlexTH.thLexRule [()] pat action

Setup charSetP:

-- | Build a pattern from a character set, using the UTF-8 encoder
-- 'TlexEnc.charSetPUtf8' from the encoding plugin.
charSetP :: CharSet.CharSet -> Pattern
charSetP = TlexEnc.charSetP TlexEnc.charSetPUtf8

-- | Build a pattern from a single character, using the UTF-8 encoder
-- 'TlexEnc.charSetPUtf8' from the encoding plugin.
chP :: Char -> Pattern
chP = TlexEnc.chP TlexEnc.charSetPUtf8

Write lexer rules:

-- | Build the scanner from 'lexerRules' and emit it as Template Haskell
-- declarations, ready to be spliced into a module.
buildLexer :: TH.Q [TH.Dec]
buildLexer =
    TlexTH.buildTHScannerWithReify lexerRules >>= TlexTH.outputScanner

-- | Tokens produced by the example lexer. Each constructor carries the code
-- units passed to its action.
--
-- 'Eq' and 'Show' are derived so that lexer output can be printed and compared
-- (for example in GHCi or in tests).
data Token
    = TokWhiteSpace [LexerCodeUnit] -- ^ built by the whitespace rule
    | TokSmallAlpha [LexerCodeUnit] -- ^ built by the @\'a\'..\'z\'@ rule
    | TokLargeAlpha [LexerCodeUnit] -- ^ built by the @\'A\'..\'Z\'@ rule
    | TokDigit [LexerCodeUnit]      -- ^ built by the @\'0\'..\'9\'@ rule
    deriving (Eq, Show)

-- | All rules of the example lexer: a 'Tlex.someP' repetition of whitespace,
-- plus one rule each for a lowercase letter, an uppercase letter, and a digit.
lexerRules :: ScannerBuilder ()
lexerRules = do
    rule (Tlex.someP whitecharP) [||TokWhiteSpace||]
    rule (rangeP 'a' 'z') [||TokSmallAlpha||]
    rule (rangeP 'A' 'Z') [||TokLargeAlpha||]
    rule (rangeP '0' '9') [||TokDigit||]
  where
    -- Pattern for one character in the inclusive range given to CharSet.range.
    rangeP lo hi = charSetP (CharSet.range lo hi)

-- | Pattern for one whitespace character: space, tab, line feed, or
-- carriage return.
--
-- Every alternative is wrapped with 'chP' so the list handed to 'Tlex.orP'
-- contains patterns; bare 'Char' literals do not type-check there.
whitecharP :: Pattern
whitecharP = Tlex.orP
    [ chP ' '
    , chP '\t'
    , chP '\n'
    , chP '\r'
    ]

Build lexer:

-- Top-level Template Haskell splice: runs 'Lexer.Rules.buildLexer' at compile
-- time and inserts the declarations it returns into this module
-- (presumably including the 'tlexScan' used by 'lexByteString' below).
$(Lexer.Rules.buildLexer)

-- | Input monad for scanning a 'ByteString'.
--
-- It wraps a function that takes the input and an 'Int' and returns a result
-- together with a new 'Int'. 'runInputByteString' starts that 'Int' at 0 and
-- the 'TlexContext' instance uses it as an index into the input.
newtype InputByteString a = InputByteString
    { unInputByteString :: ByteString -> Int -> (a, Int)
    }
    -- The instances are borrowed from the equivalent reader-over-state stack.
    deriving (Functor, Applicative, Monad)
        via (ReaderT ByteString (State Int))

-- | Run an 'InputByteString' computation over the given input, starting at
-- index 0. Returns the computation's result paired with the final index.
runInputByteString :: InputByteString a -> ByteString -> (a, Int)
runInputByteString m input = unInputByteString m input 0

-- | Lets the scanner read from an 'InputByteString', with the byte index
-- serving as the mark.
instance TlexContext Int Word8 InputByteString where
    -- Return the byte at the current index (or Nothing past the end) and step
    -- the index forward, so successive reads walk through the input instead
    -- of returning the same byte forever.
    tlexGetInputPart = InputByteString $ \bs i -> (bs `indexMaybe` i, i + 1)
    -- The mark is the current index; reading it leaves the index unchanged.
    tlexGetMark = InputByteString $ \_ i -> (i, i)

-- | Lex a whole input by running the scanner repeatedly on what remains.
--
-- Returns 'Nothing' as soon as the scanner reports 'TlexNotAccepted', and
-- otherwise 'Just' the results of the matched actions in input order.
--
-- NOTE(review): the declared element type is 'ByteString.ByteString', while
-- 'LexerAction' in the rules module returns a @Token@ built from a list of
-- code units. Confirm which types are intended before relying on this
-- signature.
lexByteString :: ByteString.ByteString -> Maybe [ByteString.ByteString]
lexByteString s0 = go s0 id where
    -- acc is a difference list: it prepends every token seen so far, so each
    -- new token is appended cheaply and the final order matches the input.
    go s acc = case runInputByteString (tlexScan ()) s of
        (TlexEndOfInput, _)     -> Just $ acc []
        (TlexNotAccepted, _)    -> Nothing
        (TlexAccepted n act, _) ->
            let (consumed, rest) = splitAt n s
                token = act consumed
            -- Append the computed token (not the raw action), and avoid
            -- shadowing the mark n inside the lambda.
            in go rest $ \ts -> acc (token : ts)

Examples