{-# OPTIONS_GHC -fno-warn-missing-methods #-}
{-# LANGUAGE StrictData            #-}
{-# LANGUAGE BangPatterns          #-}
{-# LANGUAGE CPP                   #-}
{-# LANGUAGE DeriveFunctor         #-}
{-# LANGUAGE FlexibleInstances     #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE StandaloneDeriving    #-}
{-# LANGUAGE TypeSynonymInstances  #-}
module Skylighting.Tokenizer (
    tokenize
  , TokenizerConfig(..)
  ) where

import Control.Applicative
import Control.Monad
import Control.Monad.Except
import Control.Monad.Reader
import Control.Monad.State.Strict
import qualified Data.Attoparsec.ByteString.Char8 as A
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.UTF8 as UTF8
import Data.CaseInsensitive (mk)
import Data.Char (isAlphaNum, isAscii, isDigit, isLetter, isSpace, ord)
import qualified Data.Map as Map
import qualified Data.IntMap as IntMap
import Data.Maybe (catMaybes)
import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Text.Encoding (decodeUtf8', encodeUtf8)
import Debug.Trace
import Skylighting.Regex
import Skylighting.Types
import Skylighting.Parser (resolveKeywords)
import Data.List.NonEmpty (NonEmpty((:|)), (<|), toList)
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup
#endif

newtype Captures = Captures{ Captures -> IntMap ByteString
unCaptures :: IntMap.IntMap ByteString }
  deriving (Int -> Captures -> ShowS
[Captures] -> ShowS
Captures -> [Char]
(Int -> Captures -> ShowS)
-> (Captures -> [Char]) -> ([Captures] -> ShowS) -> Show Captures
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Captures -> ShowS
showsPrec :: Int -> Captures -> ShowS
$cshow :: Captures -> [Char]
show :: Captures -> [Char]
$cshowList :: [Captures] -> ShowS
showList :: [Captures] -> ShowS
Show)

newtype ContextStack =
  ContextStack{ ContextStack -> NonEmpty (Context, Captures)
unContextStack :: NonEmpty (Context, Captures) }
  deriving (Int -> ContextStack -> ShowS
[ContextStack] -> ShowS
ContextStack -> [Char]
(Int -> ContextStack -> ShowS)
-> (ContextStack -> [Char])
-> ([ContextStack] -> ShowS)
-> Show ContextStack
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> ContextStack -> ShowS
showsPrec :: Int -> ContextStack -> ShowS
$cshow :: ContextStack -> [Char]
show :: ContextStack -> [Char]
$cshowList :: [ContextStack] -> ShowS
showList :: [ContextStack] -> ShowS
Show)

data TokenizerState = TokenizerState{
    TokenizerState -> ByteString
input               :: ByteString
  , TokenizerState -> Bool
endline             :: Bool
  , TokenizerState -> Char
prevChar            :: Char
  , TokenizerState -> ContextStack
contextStack        :: ContextStack
  , TokenizerState -> Captures
captures            :: Captures
  , TokenizerState -> Int
column              :: Int
  , TokenizerState -> Bool
lineContinuation    :: Bool
  , TokenizerState -> Maybe Int
firstNonspaceColumn :: Maybe Int
}

-- | Configuration options for 'tokenize'.
data TokenizerConfig = TokenizerConfig{
    TokenizerConfig -> SyntaxMap
syntaxMap   :: SyntaxMap  -- ^ Syntax map to use
  , TokenizerConfig -> Bool
traceOutput :: Bool       -- ^ Generate trace output for debugging
} deriving (Int -> TokenizerConfig -> ShowS
[TokenizerConfig] -> ShowS
TokenizerConfig -> [Char]
(Int -> TokenizerConfig -> ShowS)
-> (TokenizerConfig -> [Char])
-> ([TokenizerConfig] -> ShowS)
-> Show TokenizerConfig
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> TokenizerConfig -> ShowS
showsPrec :: Int -> TokenizerConfig -> ShowS
$cshow :: TokenizerConfig -> [Char]
show :: TokenizerConfig -> [Char]
$cshowList :: [TokenizerConfig] -> ShowS
showList :: [TokenizerConfig] -> ShowS
Show)

data Result e a = Success a
                | Failure
                | Error e
     deriving ((forall a b. (a -> b) -> Result e a -> Result e b)
-> (forall a b. a -> Result e b -> Result e a)
-> Functor (Result e)
forall a b. a -> Result e b -> Result e a
forall a b. (a -> b) -> Result e a -> Result e b
forall e a b. a -> Result e b -> Result e a
forall e a b. (a -> b) -> Result e a -> Result e b
forall (f :: * -> *).
(forall a b. (a -> b) -> f a -> f b)
-> (forall a b. a -> f b -> f a) -> Functor f
$cfmap :: forall e a b. (a -> b) -> Result e a -> Result e b
fmap :: forall a b. (a -> b) -> Result e a -> Result e b
$c<$ :: forall e a b. a -> Result e b -> Result e a
<$ :: forall a b. a -> Result e b -> Result e a
Functor)

deriving instance (Show a, Show e) => Show (Result e a)

newtype TokenizerM a = TM { forall a.
TokenizerM a
-> TokenizerConfig
-> TokenizerState
-> (TokenizerState, Result [Char] a)
runTokenizerM :: TokenizerConfig
                                          -> TokenizerState
                                          -> (TokenizerState, Result String a) }

mapsnd :: (a -> b) -> (c, a) -> (c, b)
mapsnd :: forall a b c. (a -> b) -> (c, a) -> (c, b)
mapsnd a -> b
f (c
x, a
y) = (c
x, a -> b
f a
y)

instance Functor TokenizerM where
  fmap :: forall a b. (a -> b) -> TokenizerM a -> TokenizerM b
fmap a -> b
f (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
g) = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] b))
-> TokenizerM b
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s -> (Result [Char] a -> Result [Char] b)
-> (TokenizerState, Result [Char] a)
-> (TokenizerState, Result [Char] b)
forall a b c. (a -> b) -> (c, a) -> (c, b)
mapsnd ((a -> b) -> Result [Char] a -> Result [Char] b
forall a b. (a -> b) -> Result [Char] a -> Result [Char] b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap a -> b
f) (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
g TokenizerConfig
c TokenizerState
s))

instance Applicative TokenizerM where
  pure :: forall a. a -> TokenizerM a
pure a
x = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, a -> Result [Char] a
forall e a. a -> Result e a
Success a
x))
  (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] (a -> b))
f) <*> :: forall a b. TokenizerM (a -> b) -> TokenizerM a -> TokenizerM b
<*> (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y) = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] b))
-> TokenizerM b
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                           case (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] (a -> b))
f TokenizerConfig
c TokenizerState
s) of
                              (TokenizerState
s', Result [Char] (a -> b)
Failure   ) -> (TokenizerState
s', Result [Char] b
forall e a. Result e a
Failure)
                              (TokenizerState
s', Error [Char]
e   ) -> (TokenizerState
s', [Char] -> Result [Char] b
forall e a. e -> Result e a
Error [Char]
e)
                              (TokenizerState
s', Success a -> b
f') ->
                                  case (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y TokenizerConfig
c TokenizerState
s') of
                                    (TokenizerState
s'', Result [Char] a
Failure   ) -> (TokenizerState
s'', Result [Char] b
forall e a. Result e a
Failure)
                                    (TokenizerState
s'', Error [Char]
e'  ) -> (TokenizerState
s'', [Char] -> Result [Char] b
forall e a. e -> Result e a
Error [Char]
e')
                                    (TokenizerState
s'', Success a
y') -> (TokenizerState
s'', b -> Result [Char] b
forall e a. a -> Result e a
Success (a -> b
f' a
y')))


instance Monad TokenizerM where
  return :: forall a. a -> TokenizerM a
return = a -> TokenizerM a
forall a. a -> TokenizerM a
forall (f :: * -> *) a. Applicative f => a -> f a
pure
  (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x) >>= :: forall a b. TokenizerM a -> (a -> TokenizerM b) -> TokenizerM b
>>= a -> TokenizerM b
f = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] b))
-> TokenizerM b
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                       case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x TokenizerConfig
c TokenizerState
s of
                            (TokenizerState
s', Result [Char] a
Failure   ) -> (TokenizerState
s', Result [Char] b
forall e a. Result e a
Failure)
                            (TokenizerState
s', Error [Char]
e   ) -> (TokenizerState
s', [Char] -> Result [Char] b
forall e a. e -> Result e a
Error [Char]
e)
                            (TokenizerState
s', Success a
x') -> TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] b)
g TokenizerConfig
c TokenizerState
s'
                              where TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] b)
g = a -> TokenizerM b
f a
x')

instance Alternative TokenizerM where
  empty :: forall a. TokenizerM a
empty = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, Result [Char] a
forall e a. Result e a
Failure))
  <|> :: forall a. TokenizerM a -> TokenizerM a -> TokenizerM a
(<|>) (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x) (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y) = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                           case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x TokenizerConfig
c TokenizerState
s of
                                (TokenizerState
_, Result [Char] a
Failure   )  -> TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y TokenizerConfig
c TokenizerState
s
                                (TokenizerState
s', Error [Char]
e   ) -> (TokenizerState
s', [Char] -> Result [Char] a
forall e a. e -> Result e a
Error [Char]
e)
                                (TokenizerState
s', Success a
x') -> (TokenizerState
s', a -> Result [Char] a
forall e a. a -> Result e a
Success a
x'))
  many :: forall a. TokenizerM a -> TokenizerM [a]
many (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x) = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] [a]))
-> TokenizerM [a]
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s ->
                    case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x TokenizerConfig
c TokenizerState
s of
                       (TokenizerState
_, Result [Char] a
Failure   )  -> (TokenizerState
s, [a] -> Result [Char] [a]
forall e a. a -> Result e a
Success [])
                       (TokenizerState
s', Error [Char]
e   ) -> (TokenizerState
s', [Char] -> Result [Char] [a]
forall e a. e -> Result e a
Error [Char]
e)
                       (TokenizerState
s', Success a
x') -> (Result [Char] [a] -> Result [Char] [a])
-> (TokenizerState, Result [Char] [a])
-> (TokenizerState, Result [Char] [a])
forall a b c. (a -> b) -> (c, a) -> (c, b)
mapsnd (([a] -> [a]) -> Result [Char] [a] -> Result [Char] [a]
forall a b. (a -> b) -> Result [Char] a -> Result [Char] b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (a
x'a -> [a] -> [a]
forall a. a -> [a] -> [a]
:)) (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] [a])
g TokenizerConfig
c TokenizerState
s')
                         where TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] [a])
g = TokenizerM a -> TokenizerM [a]
forall a. TokenizerM a -> TokenizerM [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many ((TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x))
  some :: forall a. TokenizerM a -> TokenizerM [a]
some TokenizerM a
x = (:) (a -> [a] -> [a]) -> TokenizerM a -> TokenizerM ([a] -> [a])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> TokenizerM a
x TokenizerM ([a] -> [a]) -> TokenizerM [a] -> TokenizerM [a]
forall a b. TokenizerM (a -> b) -> TokenizerM a -> TokenizerM b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> TokenizerM a -> TokenizerM [a]
forall a. TokenizerM a -> TokenizerM [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many TokenizerM a
x

instance MonadPlus TokenizerM where
  mzero :: forall a. TokenizerM a
mzero = TokenizerM a
forall a. TokenizerM a
forall (f :: * -> *) a. Alternative f => f a
empty
  mplus :: forall a. TokenizerM a -> TokenizerM a -> TokenizerM a
mplus = TokenizerM a -> TokenizerM a -> TokenizerM a
forall a. TokenizerM a -> TokenizerM a -> TokenizerM a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
(<|>)

instance MonadReader TokenizerConfig TokenizerM where
  ask :: TokenizerM TokenizerConfig
ask = (TokenizerConfig
 -> TokenizerState
 -> (TokenizerState, Result [Char] TokenizerConfig))
-> TokenizerM TokenizerConfig
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s -> (TokenizerState
s, TokenizerConfig -> Result [Char] TokenizerConfig
forall e a. a -> Result e a
Success TokenizerConfig
c))
  local :: forall a.
(TokenizerConfig -> TokenizerConfig)
-> TokenizerM a -> TokenizerM a
local TokenizerConfig -> TokenizerConfig
f (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x) = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> (TokenizerConfig -> TokenizerConfig)
-> TokenizerConfig
-> TokenizerState
-> (TokenizerState, Result [Char] a)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. TokenizerConfig -> TokenizerConfig
f)

instance MonadState TokenizerState TokenizerM where
  get :: TokenizerM TokenizerState
get = (TokenizerConfig
 -> TokenizerState
 -> (TokenizerState, Result [Char] TokenizerState))
-> TokenizerM TokenizerState
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, TokenizerState -> Result [Char] TokenizerState
forall e a. a -> Result e a
Success TokenizerState
s))
  put :: TokenizerState -> TokenizerM ()
put TokenizerState
x = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] ()))
-> TokenizerM ()
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
_ -> (TokenizerState
x, () -> Result [Char] ()
forall e a. a -> Result e a
Success ()))

instance MonadError String TokenizerM where
  throwError :: forall a. [Char] -> TokenizerM a
throwError [Char]
e = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
_ TokenizerState
s -> (TokenizerState
s, [Char] -> Result [Char] a
forall e a. e -> Result e a
Error [Char]
e))
  catchError :: forall a. TokenizerM a -> ([Char] -> TokenizerM a) -> TokenizerM a
catchError (TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x) [Char] -> TokenizerM a
f = (TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
forall a.
(TokenizerConfig
 -> TokenizerState -> (TokenizerState, Result [Char] a))
-> TokenizerM a
TM (\TokenizerConfig
c TokenizerState
s -> case TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
x TokenizerConfig
c TokenizerState
s of
                                      (TokenizerState
_, Error [Char]
e) -> let TM TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y = [Char] -> TokenizerM a
f [Char]
e in TokenizerConfig
-> TokenizerState -> (TokenizerState, Result [Char] a)
y TokenizerConfig
c TokenizerState
s
                                      (TokenizerState, Result [Char] a)
z            -> (TokenizerState, Result [Char] a)
z)

-- | Tokenize some text using 'Syntax'.
tokenize :: TokenizerConfig -> Syntax -> Text -> Either String [SourceLine]
tokenize :: TokenizerConfig -> Syntax -> Text -> Either [Char] [SourceLine]
tokenize TokenizerConfig
config Syntax
syntax Text
inp =
  Either [Char] ContextStack
eitherStack Either [Char] ContextStack
-> (ContextStack -> Either [Char] [SourceLine])
-> Either [Char] [SourceLine]
forall a b.
Either [Char] a -> (a -> Either [Char] b) -> Either [Char] b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \(!ContextStack
stack) ->
    case TokenizerM [SourceLine]
-> TokenizerConfig
-> TokenizerState
-> (TokenizerState, Result [Char] [SourceLine])
forall a.
TokenizerM a
-> TokenizerConfig
-> TokenizerState
-> (TokenizerState, Result [Char] a)
runTokenizerM TokenizerM [SourceLine]
action
            TokenizerConfig
config{ syntaxMap = Map.map (resolveKeywords (syntaxMap config))
                                          (syntaxMap config) }
            (ContextStack -> TokenizerState
startingState ContextStack
stack) of
       (TokenizerState
_, Success [SourceLine]
ls) -> [SourceLine] -> Either [Char] [SourceLine]
forall a b. b -> Either a b
Right [SourceLine]
ls
       (TokenizerState
_, Error [Char]
e)    -> [Char] -> Either [Char] [SourceLine]
forall a b. a -> Either a b
Left [Char]
e
       (TokenizerState
_, Result [Char] [SourceLine]
Failure)    -> [Char] -> Either [Char] [SourceLine]
forall a b. a -> Either a b
Left [Char]
"Could not tokenize code"
  where
    action :: TokenizerM [SourceLine]
action = ((ByteString, Int) -> TokenizerM SourceLine)
-> [(ByteString, Int)] -> TokenizerM [SourceLine]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM (ByteString, Int) -> TokenizerM SourceLine
tokenizeLine ([ByteString] -> [Int] -> [(ByteString, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip (ByteString -> [ByteString]
BS.lines (Text -> ByteString
encodeUtf8 Text
inp)) [Int
1..])
    eitherStack :: Either [Char] ContextStack
eitherStack = case Text -> Syntax -> Maybe Context
lookupContext (Syntax -> Text
sStartingContext Syntax
syntax)
                         (SyntaxMap -> Syntax -> Syntax
resolveKeywords (TokenizerConfig -> SyntaxMap
syntaxMap TokenizerConfig
config) Syntax
syntax) of
                    Just Context
c  -> ContextStack -> Either [Char] ContextStack
forall a b. b -> Either a b
Right (ContextStack -> Either [Char] ContextStack)
-> ContextStack -> Either [Char] ContextStack
forall a b. (a -> b) -> a -> b
$ NonEmpty (Context, Captures) -> ContextStack
ContextStack ((Context
c, IntMap ByteString -> Captures
Captures IntMap ByteString
forall a. Monoid a => a
mempty) (Context, Captures)
-> [(Context, Captures)] -> NonEmpty (Context, Captures)
forall a. a -> [a] -> NonEmpty a
:| [])
                    Maybe Context
Nothing -> [Char] -> Either [Char] ContextStack
forall a b. a -> Either a b
Left [Char]
"No starting context specified"
    startingState :: ContextStack -> TokenizerState
startingState ContextStack
stack =
      TokenizerState{ input :: ByteString
input = ByteString
BS.empty
                    , endline :: Bool
endline = Text -> Bool
Text.null Text
inp
                    , prevChar :: Char
prevChar = Char
'\n'
                    , contextStack :: ContextStack
contextStack = ContextStack
stack
                    , captures :: Captures
captures = IntMap ByteString -> Captures
Captures IntMap ByteString
forall a. Monoid a => a
mempty
                    , column :: Int
column = Int
0
                    , lineContinuation :: Bool
lineContinuation = Bool
False
                    , firstNonspaceColumn :: Maybe Int
firstNonspaceColumn = Maybe Int
forall a. Maybe a
Nothing
                    }

info :: String -> TokenizerM ()
info :: [Char] -> TokenizerM ()
info [Char]
s = do
  tr <- (TokenizerConfig -> Bool) -> TokenizerM Bool
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> Bool
traceOutput
  when tr $ trace s (return ())

infoContextStack :: TokenizerM ()
infoContextStack :: TokenizerM ()
infoContextStack = do
  tr <- (TokenizerConfig -> Bool) -> TokenizerM Bool
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> Bool
traceOutput
  when tr $ do
    ContextStack stack <- gets contextStack
    info $ "CONTEXT STACK " ++ show (map (cName . fst) $ toList stack)

popContextStack :: TokenizerM ()
popContextStack :: TokenizerM ()
popContextStack = do
  ContextStack cs <- (TokenizerState -> ContextStack) -> TokenizerM ContextStack
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
  case cs of
       ((Context, Captures)
_ :| []) -> [Char] -> TokenizerM ()
info [Char]
"WARNING: Tried to pop only element on context stack!"
       ((Context, Captures)
_ :| ((Context, Captures)
x:[(Context, Captures)]
xs)) -> do
         (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify (\TokenizerState
st -> TokenizerState
st{ contextStack = ContextStack (x :| xs) })
         TokenizerM ()
infoContextStack

pushContextStack :: Context -> TokenizerM ()
pushContextStack :: Context -> TokenizerM ()
pushContextStack Context
cont = do
  (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify (\TokenizerState
st -> TokenizerState
st{ contextStack =
                      ContextStack
                       (((cont, Captures mempty) <|) . unContextStack
                         $ contextStack st) } )
  TokenizerM ()
infoContextStack

currentContext :: TokenizerM Context
currentContext :: TokenizerM Context
currentContext = do
  ContextStack ((c,_) :| _) <- (TokenizerState -> ContextStack) -> TokenizerM ContextStack
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
  return c

doContextSwitch :: ContextSwitch -> TokenizerM ()
doContextSwitch :: ContextSwitch -> TokenizerM ()
doContextSwitch ContextSwitch
Pop = TokenizerM ()
popContextStack
doContextSwitch (Push (!Text
syn,!Text
c)) = do
  syntaxes <- (TokenizerConfig -> SyntaxMap) -> TokenizerM SyntaxMap
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> SyntaxMap
syntaxMap
  case Map.lookup syn syntaxes >>= lookupContext c of
       Just !Context
con -> Context -> TokenizerM ()
pushContextStack Context
con
       Maybe Context
Nothing   -> [Char] -> TokenizerM ()
forall a. [Char] -> TokenizerM a
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError ([Char] -> TokenizerM ()) -> [Char] -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ [Char]
"Unknown syntax or context: " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ (Text, Text) -> [Char]
forall a. Show a => a -> [Char]
show (Text
syn, Text
c)

doContextSwitches :: [ContextSwitch] -> TokenizerM ()
doContextSwitches :: [ContextSwitch] -> TokenizerM ()
doContextSwitches = (ContextSwitch -> TokenizerM ())
-> [ContextSwitch] -> TokenizerM ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ ContextSwitch -> TokenizerM ()
doContextSwitch

addCaptures :: TokenizerM ()
addCaptures :: TokenizerM ()
addCaptures = do
  capts <- (TokenizerState -> Captures) -> TokenizerM Captures
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Captures
captures
  if IntMap.null (unCaptures capts)
     then return ()
     else do
       ContextStack ((c,_) :| cs) <- gets contextStack
       info $ "Adding captures to " <> show (cName c) <> ": " <> show capts
       modify $ \TokenizerState
st -> TokenizerState
st{ contextStack = ContextStack ((c,capts) :| cs) }

getCapture :: Int -> TokenizerM Text
getCapture :: Int -> TokenizerM Text
getCapture Int
capnum = do
  ContextStack ((_,Captures capts) :| _) <- (TokenizerState -> ContextStack) -> TokenizerM ContextStack
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ContextStack
contextStack
  info $ "Retrieving capture " <> show capnum
  res <- case IntMap.lookup capnum capts of
          Maybe ByteString
Nothing -> do
            [Char] -> TokenizerM ()
info [Char]
"Not found"
            TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
          Just ByteString
x  -> ByteString -> TokenizerM Text
decodeBS ByteString
x
  info $ "Got " <> show res
  return res

lookupContext :: Text -> Syntax -> Maybe Context
lookupContext :: Text -> Syntax -> Maybe Context
lookupContext Text
name Syntax
syntax | Text -> Bool
Text.null Text
name =
  if Text -> Bool
Text.null (Syntax -> Text
sStartingContext Syntax
syntax)
     then Maybe Context
forall a. Maybe a
Nothing
     else Text -> Syntax -> Maybe Context
lookupContext (Syntax -> Text
sStartingContext Syntax
syntax) Syntax
syntax
lookupContext Text
name Syntax
syntax = Text -> Map Text Context -> Maybe Context
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
name (Map Text Context -> Maybe Context)
-> Map Text Context -> Maybe Context
forall a b. (a -> b) -> a -> b
$ Syntax -> Map Text Context
sContexts Syntax
syntax

tokenizeLine :: (ByteString, Int) -> TokenizerM [Token]
tokenizeLine :: (ByteString, Int) -> TokenizerM SourceLine
tokenizeLine (!ByteString
ln, !Int
linenum) = do
  (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ input = ln, endline = BS.null ln, prevChar = '\n' }
  cur <- TokenizerM Context
currentContext
  lineCont <- gets lineContinuation
  if lineCont
     then modify $ \TokenizerState
st -> TokenizerState
st{ lineContinuation = False }
     else do
       let !mbFirstNonspace = (Char -> Bool) -> ByteString -> Maybe Int
BS.findIndex (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace) (ByteString -> Maybe Int) -> ByteString -> Maybe Int
forall a b. (a -> b) -> a -> b
$! ByteString
ln
       modify $ \TokenizerState
st -> TokenizerState
st{ column = 0
                         , firstNonspaceColumn = mbFirstNonspace }
       doContextSwitches (cLineBeginContext cur)
  if BS.null ln
     then doContextSwitches (cLineEmptyContext cur)
     else doContextSwitches (cLineBeginContext cur)
  ts <- normalizeHighlighting . catMaybes <$> many getToken
  eol <- gets endline
  if eol
     then do
       currentContext >>= checkLineEnd
       return ts
     else do  -- fail if we haven't consumed whole line
       col <- gets column
       throwError $ "Could not match anything at line " ++
         show linenum ++ " column " ++ show col

getToken :: TokenizerM (Maybe Token)
getToken :: TokenizerM (Maybe (TokenType, Text))
getToken = do
  inp <- (TokenizerState -> ByteString) -> TokenizerM ByteString
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
  gets endline >>= guard . not
  !context <- currentContext
  msum (map (\Rule
r -> Rule -> ByteString -> TokenizerM (Maybe (TokenType, Text))
tryRule Rule
r ByteString
inp) (cRules context)) <|>
     case cFallthroughContext context of
           [] | Context -> Bool
cFallthrough Context
context -> Maybe (TokenType, Text)
forall a. Maybe a
Nothing Maybe (TokenType, Text)
-> TokenizerM () -> TokenizerM (Maybe (TokenType, Text))
forall a b. a -> TokenizerM b -> TokenizerM a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ [ContextSwitch] -> TokenizerM ()
doContextSwitches [ContextSwitch
Pop]
              | Bool
otherwise -> do
                  t <- TokenizerM Text
normalChunk
                  let mbtok = (TokenType, Text) -> Maybe (TokenType, Text)
forall a. a -> Maybe a
Just (Context -> TokenType
cAttribute Context
context, Text
t)
                  info $ "FALLTHROUGH " ++ show mbtok
                  return mbtok
           [ContextSwitch]
cs -> Maybe (TokenType, Text)
forall a. Maybe a
Nothing Maybe (TokenType, Text)
-> TokenizerM () -> TokenizerM (Maybe (TokenType, Text))
forall a b. a -> TokenizerM b -> TokenizerM a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ [ContextSwitch] -> TokenizerM ()
doContextSwitches [ContextSwitch]
cs

takeChars :: Int -> TokenizerM Text
takeChars :: Int -> TokenizerM Text
takeChars Int
0 = TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
takeChars Int
numchars = do
  inp <- (TokenizerState -> ByteString) -> TokenizerM ByteString
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
  let (bs,rest) = UTF8.splitAt numchars inp
  guard $ not (BS.null bs)
  !t <- decodeBS bs
  modify $ \TokenizerState
st -> TokenizerState
st{ input = rest,
                      endline = BS.null rest,
                      prevChar = Text.last t,
                      column = column st + numchars }
  return t

tryRule :: Rule -> ByteString -> TokenizerM (Maybe Token)
tryRule :: Rule -> ByteString -> TokenizerM (Maybe (TokenType, Text))
tryRule Rule
_    ByteString
""  = TokenizerM (Maybe (TokenType, Text))
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
tryRule Rule
rule ByteString
inp = do
  [Char] -> TokenizerM ()
info ([Char] -> TokenizerM ()) -> [Char] -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ [Char]
"Trying rule " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Rule -> [Char]
forall a. Show a => a -> [Char]
show Rule
rule
  case Rule -> Maybe Int
rColumn Rule
rule of
       Maybe Int
Nothing -> () -> TokenizerM ()
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return ()
       Just Int
n  -> (TokenizerState -> Int) -> TokenizerM Int
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Int
column TokenizerM Int -> (Int -> TokenizerM ()) -> TokenizerM ()
forall a b. TokenizerM a -> (a -> TokenizerM b) -> TokenizerM b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Bool -> TokenizerM ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> TokenizerM ()) -> (Int -> Bool) -> Int -> TokenizerM ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
n)

  Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Rule -> Bool
rFirstNonspace Rule
rule) (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ do
    !firstNonspace <- (TokenizerState -> Maybe Int) -> TokenizerM (Maybe Int)
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Maybe Int
firstNonspaceColumn
    !col <- gets column
    guard (firstNonspace == Just col)

  oldstate <- if Rule -> Bool
rLookahead Rule
rule
                 then TokenizerState -> Maybe TokenizerState
forall a. a -> Maybe a
Just (TokenizerState -> Maybe TokenizerState)
-> TokenizerM TokenizerState -> TokenizerM (Maybe TokenizerState)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> TokenizerM TokenizerState
forall s (m :: * -> *). MonadState s m => m s
get -- needed for lookahead rules
                 else Maybe TokenizerState -> TokenizerM (Maybe TokenizerState)
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe TokenizerState
forall a. Maybe a
Nothing

  -- reset regex captures
  modify $ \TokenizerState
st -> TokenizerState
st{ captures = Captures mempty }

  let attr = Rule -> TokenType
rAttribute Rule
rule
  mbtok <- case rMatcher rule of
                DetectChar Char
c -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ Bool -> Char -> ByteString -> TokenizerM Text
detectChar (Rule -> Bool
rDynamic Rule
rule) Char
c ByteString
inp
                Detect2Chars Char
c Char
d -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$
                                      Bool -> Char -> Char -> ByteString -> TokenizerM Text
detect2Chars (Rule -> Bool
rDynamic Rule
rule) Char
c Char
d ByteString
inp
                AnyChar Set Char
cs -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ Set Char -> ByteString -> TokenizerM Text
anyChar Set Char
cs ByteString
inp
                RangeDetect Char
c Char
d -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ Char -> Char -> ByteString -> TokenizerM Text
rangeDetect Char
c Char
d ByteString
inp
                RegExpr RE
re -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ Bool -> RE -> ByteString -> TokenizerM Text
regExpr (Rule -> Bool
rDynamic Rule
rule) RE
re ByteString
inp
                Matcher
Int -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseInt ByteString
inp
                Matcher
HlCOct -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseOct ByteString
inp
                Matcher
HlCHex -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseHex ByteString
inp
                Matcher
HlCStringChar -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseCStringChar ByteString
inp
                Matcher
HlCChar -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseCChar ByteString
inp
                Matcher
Float -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
parseFloat ByteString
inp
                Keyword KeywordAttr
_kwattr (Left Text
listname) ->
                  [Char] -> TokenizerM (Maybe (TokenType, Text))
forall a. [Char] -> TokenizerM a
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError ([Char] -> TokenizerM (Maybe (TokenType, Text)))
-> [Char] -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ [Char]
"Keyword with unresolved list " [Char] -> ShowS
forall a. Semigroup a => a -> a -> a
<> Text -> [Char]
forall a. Show a => a -> [Char]
show Text
listname
                Keyword KeywordAttr
kwattr (Right WordSet Text
kws) ->
                  TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ KeywordAttr -> WordSet Text -> ByteString -> TokenizerM Text
keyword KeywordAttr
kwattr WordSet Text
kws ByteString
inp
                StringDetect Text
s -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$
                                    Bool -> Bool -> Text -> ByteString -> TokenizerM Text
stringDetect (Rule -> Bool
rDynamic Rule
rule) (Rule -> Bool
rCaseSensitive Rule
rule)
                                                 Text
s ByteString
inp
                WordDetect Text
s -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$
                                    Bool -> Set Char -> Text -> ByteString -> TokenizerM Text
wordDetect (Rule -> Bool
rCaseSensitive Rule
rule)
                                      (Rule -> Set Char
rWeakDeliminators Rule
rule) Text
s ByteString
inp
                Matcher
LineContinue -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
lineContinue ByteString
inp
                Matcher
DetectSpaces -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
detectSpaces ByteString
inp
                Matcher
DetectIdentifier -> TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
attr (TokenizerM Text -> TokenizerM (Maybe (TokenType, Text)))
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ ByteString -> TokenizerM Text
detectIdentifier ByteString
inp
                IncludeRules (Text, Text)
cname -> Maybe TokenType
-> (Text, Text)
-> ByteString
-> TokenizerM (Maybe (TokenType, Text))
includeRules
                   (if Rule -> Bool
rIncludeAttribute Rule
rule then TokenType -> Maybe TokenType
forall a. a -> Maybe a
Just TokenType
attr else Maybe TokenType
forall a. Maybe a
Nothing)
                   (Text, Text)
cname ByteString
inp
  mbchildren <- do
    inp' <- gets input
    msum (map (\Rule
r -> Rule -> ByteString -> TokenizerM (Maybe (TokenType, Text))
tryRule Rule
r ByteString
inp') (rChildren rule)) <|> return Nothing

  mbtok' <- case mbtok of
                 Maybe (TokenType, Text)
Nothing -> Maybe (TokenType, Text) -> TokenizerM (Maybe (TokenType, Text))
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe (TokenType, Text)
forall a. Maybe a
Nothing
                 Just (TokenType
tt, Text
s)
                   | Rule -> Bool
rLookahead Rule
rule -> do
                     (oldinput, oldendline, oldprevChar, oldColumn) <-
                         case Maybe TokenizerState
oldstate of
                              Maybe TokenizerState
Nothing -> [Char] -> TokenizerM (ByteString, Bool, Char, Int)
forall a. [Char] -> TokenizerM a
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError
                                    [Char]
"oldstate not saved with lookahead rule"
                              Just TokenizerState
st -> (ByteString, Bool, Char, Int)
-> TokenizerM (ByteString, Bool, Char, Int)
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return
                                    (TokenizerState -> ByteString
input TokenizerState
st, TokenizerState -> Bool
endline TokenizerState
st,
                                     TokenizerState -> Char
prevChar TokenizerState
st, TokenizerState -> Int
column TokenizerState
st)
                     modify $ \TokenizerState
st -> TokenizerState
st{ input = oldinput
                                       , endline = oldendline
                                       , prevChar = oldprevChar
                                       , column = oldColumn }
                     return Nothing
                   | Bool
otherwise -> do
                     case Maybe (TokenType, Text)
mbchildren of
                          Maybe (TokenType, Text)
Nothing -> Maybe (TokenType, Text) -> TokenizerM (Maybe (TokenType, Text))
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe (TokenType, Text) -> TokenizerM (Maybe (TokenType, Text)))
-> Maybe (TokenType, Text) -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ (TokenType, Text) -> Maybe (TokenType, Text)
forall a. a -> Maybe a
Just (TokenType
tt, Text
s)
                          Just (TokenType
_, Text
cresult) -> Maybe (TokenType, Text) -> TokenizerM (Maybe (TokenType, Text))
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe (TokenType, Text) -> TokenizerM (Maybe (TokenType, Text)))
-> Maybe (TokenType, Text) -> TokenizerM (Maybe (TokenType, Text))
forall a b. (a -> b) -> a -> b
$ (TokenType, Text) -> Maybe (TokenType, Text)
forall a. a -> Maybe a
Just (TokenType
tt, Text
s Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
cresult)

  info $ takeWhile (/=' ') (show (rMatcher rule)) ++ " MATCHED " ++ show mbtok'
  doContextSwitches (rContextSwitch rule)
  -- Add any captures to the context on top of the stack
  addCaptures
  return mbtok'


withAttr :: TokenType -> TokenizerM Text -> TokenizerM (Maybe Token)
withAttr :: TokenType
-> TokenizerM Text -> TokenizerM (Maybe (TokenType, Text))
withAttr TokenType
tt TokenizerM Text
p = do
  res <- TokenizerM Text
p
  if Text.null res
     then return Nothing
     else return $ Just (tt, res)

wordDetect :: Bool -> Set.Set Char -> Text -> ByteString -> TokenizerM Text
wordDetect :: Bool -> Set Char -> Text -> ByteString -> TokenizerM Text
wordDetect Bool
caseSensitive Set Char
weakDelims Text
s ByteString
inp = do
  -- Removed the next line because KDE seems to allow
  -- \n<DOCTYPE! to match \b<DOCTYPE!/b:
  -- wordBoundary weakDelims inp
  t <- ByteString -> TokenizerM Text
decodeBS (ByteString -> TokenizerM Text) -> ByteString -> TokenizerM Text
forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
UTF8.take (Text -> Int
Text.length Text
s) ByteString
inp
  -- we assume here that the case fold will not change length,
  -- which is safe for ASCII keywords and the like...
  guard $ if caseSensitive
             then s == t
             else mk s == mk t
  guard $ not (Text.null t)
  let c = HasCallStack => Text -> Char
Text -> Char
Text.last Text
t
  let rest = Int -> ByteString -> ByteString
UTF8.drop (Text -> Int
Text.length Text
s) ByteString
inp
  let d = case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
rest of
               Maybe (Char, ByteString)
Nothing    -> Char
'\n'
               Just (Char
x,ByteString
_) -> Char
x
  guard $ isWordBoundary weakDelims c d
  takeChars (Text.length t)

stringDetect :: Bool -> Bool -> Text -> ByteString -> TokenizerM Text
stringDetect :: Bool -> Bool -> Text -> ByteString -> TokenizerM Text
stringDetect Bool
dynamic Bool
caseSensitive Text
s ByteString
inp = do
  s' <- if Bool
dynamic
        then do
          dynStr <- Text -> TokenizerM Text
subDynamicText Text
s
          info $ "Dynamic string: " ++ show dynStr
          return dynStr
        else Text -> TokenizerM Text
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return Text
s
  t <- decodeBS $ UTF8.take (Text.length s') inp
  -- we assume here that the case fold will not change length,
  -- which is safe for ASCII keywords and the like...
  guard $ if caseSensitive
             then s' == t
             else mk s' == mk t
  takeChars (Text.length s')

subDynamicText :: Text -> TokenizerM Text
subDynamicText :: Text -> TokenizerM Text
subDynamicText Text
t = do
  let substitute :: Text -> TokenizerM Text
substitute Text
x = case Text -> Maybe (Char, Text)
Text.uncons Text
x of
        Just (Char
c, Text
rest) | Char -> Bool
isDigit Char
c -> let capNum :: Int
capNum = Char -> Int
ord Char
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'0'
                                      in (Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
rest) (Text -> Text) -> TokenizerM Text -> TokenizerM Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Int -> TokenizerM Text
getCapture Int
capNum
        Maybe (Char, Text)
_ -> Text -> TokenizerM Text
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> TokenizerM Text) -> Text -> TokenizerM Text
forall a b. (a -> b) -> a -> b
$ Char -> Text -> Text
Text.cons Char
'%' Text
x
  case (Char -> Bool) -> Text -> [Text]
Text.split (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'%') Text
t of
    []     -> Text -> TokenizerM Text
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return Text
Text.empty
    Text
x:[Text]
rest -> (Text
x Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<>) (Text -> Text) -> ([Text] -> Text) -> [Text] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Text] -> Text
Text.concat ([Text] -> Text) -> TokenizerM [Text] -> TokenizerM Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Text -> TokenizerM Text) -> [Text] -> TokenizerM [Text]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM Text -> TokenizerM Text
substitute [Text]
rest

-- This assumes that nothing significant will happen
-- in the middle of a string of spaces or a string
-- of alphanumerics.  This seems true  for all normal
-- programming languages, and the optimization speeds
-- things up a lot, relative to just parsing one char.
normalChunk :: TokenizerM Text
normalChunk :: TokenizerM Text
normalChunk = do
  inp <- (TokenizerState -> ByteString) -> TokenizerM ByteString
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> ByteString
input
  case UTF8.uncons inp of
    Maybe (Char, ByteString)
Nothing -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
    Just (Char
c, ByteString
_)
      | Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
' ' ->
        let bs :: ByteString
bs = (Char -> Bool) -> ByteString -> ByteString
BS.takeWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
' ') ByteString
inp
        in  Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
bs)
      | Char -> Bool
isAscii Char
c Bool -> Bool -> Bool
&& Char -> Bool
isAlphaNum Char
c ->
        let (ByteString
bs, ByteString
_) = (Char -> Bool) -> ByteString -> (ByteString, ByteString)
UTF8.span Char -> Bool
isAlphaNum ByteString
inp
        in  Int -> TokenizerM Text
takeChars (ByteString -> Int
UTF8.length ByteString
bs)
      | Bool
otherwise -> Int -> TokenizerM Text
takeChars Int
1

includeRules :: Maybe TokenType -> ContextName -> ByteString
             -> TokenizerM (Maybe Token)
includeRules :: Maybe TokenType
-> (Text, Text)
-> ByteString
-> TokenizerM (Maybe (TokenType, Text))
includeRules Maybe TokenType
mbattr (Text
syn, Text
con) ByteString
inp = do
  syntaxes <- (TokenizerConfig -> SyntaxMap) -> TokenizerM SyntaxMap
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks TokenizerConfig -> SyntaxMap
syntaxMap
  case Map.lookup syn syntaxes >>= lookupContext con of
       Maybe Context
Nothing  -> do
          cur <- TokenizerM Context
currentContext
          throwError $ "IncludeRules in " ++ Text.unpack (cSyntax cur) ++
           " requires undefined context " ++
           Text.unpack con ++ "##" ++ Text.unpack syn
       Just Context
c   -> do
         mbtok <- [TokenizerM (Maybe (TokenType, Text))]
-> TokenizerM (Maybe (TokenType, Text))
forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, MonadPlus m) =>
t (m a) -> m a
msum ((Rule -> TokenizerM (Maybe (TokenType, Text)))
-> [Rule] -> [TokenizerM (Maybe (TokenType, Text))]
forall a b. (a -> b) -> [a] -> [b]
map (\Rule
r -> Rule -> ByteString -> TokenizerM (Maybe (TokenType, Text))
tryRule Rule
r ByteString
inp) (Context -> [Rule]
cRules Context
c))
         modify $ \TokenizerState
st -> TokenizerState
st{ captures = Captures mempty }
         return $ case (mbtok, mbattr) of
                    (Just (TokenType
NormalTok, Text
xs), Just TokenType
attr) -> (TokenType, Text) -> Maybe (TokenType, Text)
forall a. a -> Maybe a
Just (TokenType
attr, Text
xs)
                    (Maybe (TokenType, Text), Maybe TokenType)
_                                 -> Maybe (TokenType, Text)
mbtok

checkLineEnd :: Context -> TokenizerM ()
checkLineEnd :: Context -> TokenizerM ()
checkLineEnd Context
c = do
  Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless ([ContextSwitch] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null (Context -> [ContextSwitch]
cLineEndContext Context
c)) (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ do
    eol <- (TokenizerState -> Bool) -> TokenizerM Bool
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Bool
endline
    info $ "checkLineEnd for " ++ show (cName c) ++ " eol = " ++ show eol ++ " cLineEndContext = " ++ show (cLineEndContext c)
    when eol $ do
      lineCont' <- gets lineContinuation
      unless lineCont' $ do
        doContextSwitches (cLineEndContext c)
        c' <- currentContext
        unless (c == c') $ checkLineEnd c'

detectChar :: Bool -> Char -> ByteString -> TokenizerM Text
detectChar :: Bool -> Char -> ByteString -> TokenizerM Text
detectChar Bool
dynamic Char
c ByteString
inp = do
  c' <- if Bool
dynamic Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'9'
           then Char -> TokenizerM Char
getDynamicChar Char
c
           else Char -> TokenizerM Char
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
  case UTF8.uncons inp of
    Just (Char
x,ByteString
_) | Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
c' -> Int -> TokenizerM Text
takeChars Int
1
    Maybe (Char, ByteString)
_          -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero

getDynamicChar :: Char -> TokenizerM Char
getDynamicChar :: Char -> TokenizerM Char
getDynamicChar Char
c = do
  let capNum :: Int
capNum = Char -> Int
ord Char
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'0'
  res <- Int -> TokenizerM Text
getCapture Int
capNum
  case Text.uncons res of
       Maybe (Char, Text)
Nothing    -> TokenizerM Char
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Just (Char
d,Text
_) -> Char -> TokenizerM Char
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return Char
d

detect2Chars :: Bool -> Char -> Char -> ByteString -> TokenizerM Text
detect2Chars :: Bool -> Char -> Char -> ByteString -> TokenizerM Text
detect2Chars Bool
dynamic Char
c Char
d ByteString
inp = do
  c' <- if Bool
dynamic Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'9'
           then Char -> TokenizerM Char
getDynamicChar Char
c
           else Char -> TokenizerM Char
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
  d' <- if dynamic && d >= '0' && d <= '9'
           then getDynamicChar d
           else return d
  if (encodeUtf8 (Text.pack [c',d'])) `BS.isPrefixOf` inp
     then takeChars 2
     else mzero

rangeDetect :: Char -> Char -> ByteString -> TokenizerM Text
rangeDetect :: Char -> Char -> ByteString -> TokenizerM Text
rangeDetect Char
c Char
d ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
    Just (Char
x, ByteString
rest)
      | Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
c -> case (Char -> Bool) -> ByteString -> (ByteString, ByteString)
UTF8.span (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
d) ByteString
rest of
                       (ByteString
in_t, ByteString
out_t)
                         | ByteString -> Bool
BS.null ByteString
out_t -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
                         | Bool
otherwise -> do
                              t <- ByteString -> TokenizerM Text
decodeBS ByteString
in_t
                              takeChars (Text.length t + 2)
    Maybe (Char, ByteString)
_ -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero

-- NOTE: currently limited to ASCII
detectSpaces :: ByteString -> TokenizerM Text
detectSpaces :: ByteString -> TokenizerM Text
detectSpaces ByteString
inp = do
  case (Char -> Bool) -> ByteString -> (ByteString, ByteString)
BS.span (\Char
c -> Char -> Bool
isSpace Char
c) ByteString
inp of
       (ByteString
t, ByteString
_)
         | ByteString -> Bool
BS.null ByteString
t -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
         | Bool
otherwise -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
t)

-- NOTE: limited to ASCII as per kate documentation
detectIdentifier :: ByteString -> TokenizerM Text
detectIdentifier :: ByteString -> TokenizerM Text
detectIdentifier ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
BS.uncons ByteString
inp of
    Just (Char
c, ByteString
t) | (Char -> Bool
isAscii Char
c Bool -> Bool -> Bool
&& Char -> Bool
isLetter Char
c) Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'_' ->
      Int -> TokenizerM Text
takeChars (Int -> TokenizerM Text) -> Int -> TokenizerM Text
forall a b. (a -> b) -> a -> b
$ Int
1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int -> (Int -> Int) -> Maybe Int -> Int
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (ByteString -> Int
BS.length ByteString
t) Int -> Int
forall a. a -> a
id
                ((Char -> Bool) -> ByteString -> Maybe Int
BS.findIndex (\Char
d -> Bool -> Bool
not (Char -> Bool
isAscii Char
d) Bool -> Bool -> Bool
||
                                     Bool -> Bool
not (Char -> Bool
isAlphaNum Char
d Bool -> Bool -> Bool
|| Char
d Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'_')) ByteString
t)
    Maybe (Char, ByteString)
_ -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero

lineContinue :: ByteString -> TokenizerM Text
lineContinue :: ByteString -> TokenizerM Text
lineContinue ByteString
inp = do
  if ByteString
inp ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString
"\\"
     then do
       (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ lineContinuation = True }
       Int -> TokenizerM Text
takeChars Int
1
     else TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero

anyChar :: Set.Set Char -> ByteString -> TokenizerM Text
anyChar :: Set Char -> ByteString -> TokenizerM Text
anyChar Set Char
cs ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
     Just (Char
x, ByteString
_) | Char
x Char -> Set Char -> Bool
forall a. Ord a => a -> Set a -> Bool
`Set.member` Set Char
cs -> Int -> TokenizerM Text
takeChars Int
1
     Maybe (Char, ByteString)
_           -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero

regExpr :: Bool -> RE -> ByteString -> TokenizerM Text
regExpr :: Bool -> RE -> ByteString -> TokenizerM Text
regExpr Bool
dynamic RE
re ByteString
inp = do
  -- return $! traceShowId $! (reStr, inp)
  let reStr :: ByteString
reStr = RE -> ByteString
reString RE
re
  Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int -> ByteString -> ByteString
BS.take Int
2 ByteString
reStr ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString
"\\b") (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ Set Char -> ByteString -> TokenizerM ()
wordBoundary Set Char
forall a. Monoid a => a
mempty ByteString
inp
  regex <- case RE -> Either [Char] Regex
compileRE RE
re of
            Right Regex
r  -> Regex -> TokenizerM Regex
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return Regex
r
            Left [Char]
e   -> [Char] -> TokenizerM Regex
forall a. [Char] -> TokenizerM a
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError ([Char] -> TokenizerM Regex) -> [Char] -> TokenizerM Regex
forall a b. (a -> b) -> a -> b
$
              [Char]
"Error compiling regex " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
              ByteString -> [Char]
UTF8.toString ByteString
reStr [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
": " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
e
  regex' <- if dynamic
               then subDynamic regex
               else return regex
  case matchRegex regex' inp of
        Just (ByteString
matchedBytes, IntMap (Int, Int)
capts) -> do
          Bool -> TokenizerM () -> TokenizerM ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (IntMap (Int, Int) -> Bool
forall a. IntMap a -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null IntMap (Int, Int)
capts) (TokenizerM () -> TokenizerM ()) -> TokenizerM () -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$
            (TokenizerState -> TokenizerState) -> TokenizerM ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((TokenizerState -> TokenizerState) -> TokenizerM ())
-> (TokenizerState -> TokenizerState) -> TokenizerM ()
forall a b. (a -> b) -> a -> b
$ \TokenizerState
st -> TokenizerState
st{ captures = Captures $
                                  IntMap.map (toSlice inp) capts }
          Int -> TokenizerM Text
takeChars (ByteString -> Int
UTF8.length ByteString
matchedBytes)
        Maybe (ByteString, IntMap (Int, Int))
_ -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero

toSlice :: ByteString -> (Int, Int) -> ByteString
toSlice :: ByteString -> (Int, Int) -> ByteString
toSlice ByteString
bs (Int
off, Int
len) = Int -> ByteString -> ByteString
BS.take Int
len (ByteString -> ByteString) -> ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
BS.drop Int
off ByteString
bs

wordBoundary :: Set.Set Char -> ByteString -> TokenizerM ()
wordBoundary :: Set Char -> ByteString -> TokenizerM ()
wordBoundary Set Char
weakDelims ByteString
inp = do
  case ByteString -> Maybe (Char, ByteString)
UTF8.uncons ByteString
inp of
       Maybe (Char, ByteString)
Nothing -> () -> TokenizerM ()
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return ()
       Just (Char
d, ByteString
_) -> do
         c <- (TokenizerState -> Char) -> TokenizerM Char
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Char
prevChar
         guard $ isWordBoundary weakDelims c d

isWordBoundary :: Set.Set Char -> Char -> Char -> Bool
isWordBoundary :: Set Char -> Char -> Char -> Bool
isWordBoundary Set Char
weakDelims Char
c Char
d =
  (Char -> Bool
isWordChar Char
c Bool -> Bool -> Bool
|| Char
c Char -> Set Char -> Bool
forall a. Ord a => a -> Set a -> Bool
`Set.member` Set Char
weakDelims) Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
/=
  (Char -> Bool
isWordChar Char
d Bool -> Bool -> Bool
|| Char
d Char -> Set Char -> Bool
forall a. Ord a => a -> Set a -> Bool
`Set.member` Set Char
weakDelims)

decodeBS :: ByteString -> TokenizerM Text
decodeBS :: ByteString -> TokenizerM Text
decodeBS ByteString
bs = case ByteString -> Either UnicodeException Text
decodeUtf8' ByteString
bs of
                    Left UnicodeException
_ -> [Char] -> TokenizerM Text
forall a. [Char] -> TokenizerM a
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError ([Char]
"ByteString " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
                                ByteString -> [Char]
forall a. Show a => a -> [Char]
show ByteString
bs [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
"is not UTF8")
                    Right Text
t -> Text -> TokenizerM Text
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return Text
t

-- Substitute out %1, %2, etc. in regex string, escaping
-- appropriately..
subDynamic :: Regex -> TokenizerM Regex
subDynamic :: Regex -> TokenizerM Regex
subDynamic (MatchDynamic Int
capNum) = do
  replacement <- Int -> TokenizerM Text
getCapture Int
capNum
  return $ mconcat $ map (MatchChar . (==)) $ Text.unpack replacement
subDynamic (MatchAlt Regex
r1 Regex
r2) =
  Regex -> Regex -> Regex
MatchAlt (Regex -> Regex -> Regex)
-> TokenizerM Regex -> TokenizerM (Regex -> Regex)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r1 TokenizerM (Regex -> Regex) -> TokenizerM Regex -> TokenizerM Regex
forall a b. TokenizerM (a -> b) -> TokenizerM a -> TokenizerM b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Regex -> TokenizerM Regex
subDynamic Regex
r2
subDynamic (MatchConcat Regex
r1 Regex
r2) =
  Regex -> Regex -> Regex
MatchConcat (Regex -> Regex -> Regex)
-> TokenizerM Regex -> TokenizerM (Regex -> Regex)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r1 TokenizerM (Regex -> Regex) -> TokenizerM Regex -> TokenizerM Regex
forall a b. TokenizerM (a -> b) -> TokenizerM a -> TokenizerM b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Regex -> TokenizerM Regex
subDynamic Regex
r2
subDynamic (MatchSome Regex
r) =
  Regex -> Regex
MatchSome (Regex -> Regex) -> TokenizerM Regex -> TokenizerM Regex
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r
subDynamic (MatchCapture Int
i Regex
r) =
  Int -> Regex -> Regex
MatchCapture Int
i (Regex -> Regex) -> TokenizerM Regex -> TokenizerM Regex
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r
subDynamic (AssertPositive Direction
dir Regex
r) =
  Direction -> Regex -> Regex
AssertPositive Direction
dir (Regex -> Regex) -> TokenizerM Regex -> TokenizerM Regex
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r
subDynamic (AssertNegative Direction
dir Regex
r) =
  Direction -> Regex -> Regex
AssertNegative Direction
dir (Regex -> Regex) -> TokenizerM Regex -> TokenizerM Regex
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Regex -> TokenizerM Regex
subDynamic Regex
r
subDynamic Regex
x = Regex -> TokenizerM Regex
forall a. a -> TokenizerM a
forall (m :: * -> *) a. Monad m => a -> m a
return Regex
x

keyword :: KeywordAttr -> WordSet Text -> ByteString -> TokenizerM Text
keyword :: KeywordAttr -> WordSet Text -> ByteString -> TokenizerM Text
keyword KeywordAttr
kwattr WordSet Text
kws ByteString
inp = do
  prev <- (TokenizerState -> Char) -> TokenizerM Char
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets TokenizerState -> Char
prevChar
  guard $ prev `Set.member` (keywordDelims kwattr)
  let (w,_) = UTF8.break (`Set.member` (keywordDelims kwattr)) inp
  guard $ not (BS.null w)
  w' <- decodeBS w
  let numchars = Text -> Int
Text.length Text
w'
  if w' `inWordSet` kws
     then takeChars numchars
     else mzero

normalizeHighlighting :: [Token] -> [Token]
normalizeHighlighting :: SourceLine -> SourceLine
normalizeHighlighting [] = []
normalizeHighlighting ((!TokenType
t,!Text
x):SourceLine
xs)
  | Text -> Bool
Text.null Text
x = SourceLine -> SourceLine
normalizeHighlighting SourceLine
xs
  | Bool
otherwise =
    (TokenType
t, Text
matchedText) (TokenType, Text) -> SourceLine -> SourceLine
forall a. a -> [a] -> [a]
: SourceLine -> SourceLine
normalizeHighlighting SourceLine
rest
    where (SourceLine
matches, SourceLine
rest) = ((TokenType, Text) -> Bool)
-> SourceLine -> (SourceLine, SourceLine)
forall a. (a -> Bool) -> [a] -> ([a], [a])
span (\(TokenType
z,Text
_) -> TokenType
z TokenType -> TokenType -> Bool
forall a. Eq a => a -> a -> Bool
== TokenType
t) SourceLine
xs
          !matchedText :: Text
matchedText = [Text] -> Text
Text.concat (Text
x Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: ((TokenType, Text) -> Text) -> SourceLine -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map (TokenType, Text) -> Text
forall a b. (a, b) -> b
snd SourceLine
matches)


parseCStringChar :: ByteString -> TokenizerM Text
parseCStringChar :: ByteString -> TokenizerM Text
parseCStringChar ByteString
inp = do
  case Parser (ByteString, ())
-> ByteString -> Either [Char] (ByteString, ())
forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pCStringChar) ByteString
inp of
       Left [Char]
_      -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pCStringChar :: A.Parser ()
pCStringChar :: Parser ()
pCStringChar = do
  _ <- Char -> Parser Char
A.char Char
'\\'
  next <- A.anyChar
  case next of
       Char
c | Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'x' Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'X' -> () () -> Parser ByteString ByteString -> Parser ()
forall a b. a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser ByteString ByteString
A.takeWhile1 ([Char] -> Char -> Bool
A.inClass [Char]
"0-9a-fA-F")
         | Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'0' -> () () -> Parser ByteString ByteString -> Parser ()
forall a b. a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser ByteString ByteString
A.takeWhile ([Char] -> Char -> Bool
A.inClass [Char]
"0-7")
         | [Char] -> Char -> Bool
A.inClass [Char]
"abefnrtv\"'?\\" Char
c -> () -> Parser ()
forall a. a -> Parser ByteString a
forall (m :: * -> *) a. Monad m => a -> m a
return ()
         | Bool
otherwise -> Parser ()
forall a. Parser ByteString a
forall (m :: * -> *) a. MonadPlus m => m a
mzero

parseCChar :: ByteString -> TokenizerM Text
parseCChar :: ByteString -> TokenizerM Text
parseCChar ByteString
inp = do
  case Parser (ByteString, ())
-> ByteString -> Either [Char] (ByteString, ())
forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pCChar) ByteString
inp of
       Left [Char]
_      -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pCChar :: A.Parser ()
pCChar :: Parser ()
pCChar = do
  () () -> Parser Char -> Parser ()
forall a b. a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
A.char Char
'\''
  Parser ()
pCStringChar Parser () -> Parser () -> Parser ()
forall a.
Parser ByteString a -> Parser ByteString a -> Parser ByteString a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () () -> Parser Char -> Parser ()
forall a b. a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser Char
A.satisfy (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\'' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\\')
  () () -> Parser Char -> Parser ()
forall a b. a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
A.char Char
'\''

parseInt :: ByteString -> TokenizerM Text
parseInt :: ByteString -> TokenizerM Text
parseInt ByteString
inp = do
  Set Char -> ByteString -> TokenizerM ()
wordBoundary Set Char
forall a. Monoid a => a
mempty ByteString
inp
  case Parser (ByteString, ())
-> ByteString -> Either [Char] (ByteString, ())
forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match (Parser ()
pHex Parser () -> Parser () -> Parser ()
forall a.
Parser ByteString a -> Parser ByteString a -> Parser ByteString a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
pOct Parser () -> Parser () -> Parser ()
forall a.
Parser ByteString a -> Parser ByteString a -> Parser ByteString a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
pDec)) ByteString
inp of
       Left [Char]
_      -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pDec :: A.Parser ()
pDec :: Parser ()
pDec = do
  Parser ()
mbMinus
  Parser ByteString ByteString -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser ByteString ByteString -> Parser ())
-> Parser ByteString ByteString -> Parser ()
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Parser ByteString ByteString
A.takeWhile1 ([Char] -> Char -> Bool
A.inClass [Char]
"0-9")

parseOct :: ByteString -> TokenizerM Text
parseOct :: ByteString -> TokenizerM Text
parseOct ByteString
inp = do
  Set Char -> ByteString -> TokenizerM ()
wordBoundary Set Char
forall a. Monoid a => a
mempty ByteString
inp
  case Parser (ByteString, ())
-> ByteString -> Either [Char] (ByteString, ())
forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pHex) ByteString
inp of
       Left [Char]
_      -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pOct :: A.Parser ()
pOct :: Parser ()
pOct = do
  Parser ()
mbMinus
  _ <- Char -> Parser Char
A.char Char
'0'
  _ <- A.satisfy (A.inClass "Oo")
  _ <- A.takeWhile1 (A.inClass "0-7")
  return ()

parseHex :: ByteString -> TokenizerM Text
parseHex :: ByteString -> TokenizerM Text
parseHex ByteString
inp = do
  Set Char -> ByteString -> TokenizerM ()
wordBoundary Set Char
forall a. Monoid a => a
mempty ByteString
inp
  case Parser (ByteString, ())
-> ByteString -> Either [Char] (ByteString, ())
forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pHex) ByteString
inp of
       Left [Char]
_      -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r) -- assumes ascii

pHex :: A.Parser ()
pHex :: Parser ()
pHex = do
  Parser ()
mbMinus
  _ <- Char -> Parser Char
A.char Char
'0'
  _ <- A.satisfy (A.inClass "Xx")
  _ <- A.takeWhile1 (A.inClass "0-9a-fA-F")
  return ()

mbMinus :: A.Parser ()
mbMinus :: Parser ()
mbMinus = (() () -> Parser Char -> Parser ()
forall a b. a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Char -> Parser Char
A.char Char
'-') Parser () -> Parser () -> Parser ()
forall a.
Parser ByteString a -> Parser ByteString a -> Parser ByteString a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () -> Parser ()
forall a. a -> Parser ByteString a
forall (m :: * -> *) a. Monad m => a -> m a
return ()

mbPlusMinus :: A.Parser ()
mbPlusMinus :: Parser ()
mbPlusMinus = () () -> Parser Char -> Parser ()
forall a b. a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ (Char -> Bool) -> Parser Char
A.satisfy ([Char] -> Char -> Bool
A.inClass [Char]
"+-") Parser () -> Parser () -> Parser ()
forall a.
Parser ByteString a -> Parser ByteString a -> Parser ByteString a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () -> Parser ()
forall a. a -> Parser ByteString a
forall (m :: * -> *) a. Monad m => a -> m a
return ()

parseFloat :: ByteString -> TokenizerM Text
parseFloat :: ByteString -> TokenizerM Text
parseFloat ByteString
inp = do
  Set Char -> ByteString -> TokenizerM ()
wordBoundary Set Char
forall a. Monoid a => a
mempty ByteString
inp
  case Parser (ByteString, ())
-> ByteString -> Either [Char] (ByteString, ())
forall a. Parser a -> ByteString -> Either [Char] a
A.parseOnly (Parser () -> Parser (ByteString, ())
forall a. Parser a -> Parser (ByteString, a)
A.match Parser ()
pFloat) ByteString
inp of
       Left [Char]
_      -> TokenizerM Text
forall a. TokenizerM a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
       Right (ByteString
r,()
_) -> Int -> TokenizerM Text
takeChars (ByteString -> Int
BS.length ByteString
r)  -- assumes all ascii
  where pFloat :: A.Parser ()
        pFloat :: Parser ()
pFloat = do
          let digits :: Parser ByteString ByteString
digits = (Char -> Bool) -> Parser ByteString ByteString
A.takeWhile1 ([Char] -> Char -> Bool
A.inClass [Char]
"0-9")
          Parser ()
mbPlusMinus
          before <- Bool -> Parser ByteString Bool -> Parser ByteString Bool
forall (f :: * -> *) a. Alternative f => a -> f a -> f a
A.option Bool
False (Parser ByteString Bool -> Parser ByteString Bool)
-> Parser ByteString Bool -> Parser ByteString Bool
forall a b. (a -> b) -> a -> b
$ Bool
True Bool -> Parser ByteString ByteString -> Parser ByteString Bool
forall a b. a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Parser ByteString ByteString
digits
          dot <- A.option False $ True <$ A.satisfy (A.inClass ".")
          after <- A.option False $ True <$ digits
          e <- A.option False $ True <$ (A.satisfy (A.inClass "Ee") >>
                                         mbPlusMinus >> digits)
          mbnext <- A.peekChar
          case mbnext of
               Maybe Char
Nothing -> () -> Parser ()
forall a. a -> Parser ByteString a
forall (m :: * -> *) a. Monad m => a -> m a
return ()
               Just Char
c  -> Bool -> Parser ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ [Char] -> Char -> Bool
A.inClass [Char]
"." Char
c)
          guard $ (before && not dot && e)     -- 5e2
               || (before && dot && (after || not e)) -- 5.2e2 or 5.2 or 5.
               || (not before && dot && after) -- .23 or .23e2