-- Hey Emacs, this is -*- Haskell -*- %{ module Scanner(tokens, Token(..), nameOfTag, Tag(..), Attr(..), TagAttr(..), attrToTagAttr ) where import Char (isSpace, toLower) import List (elem, elemIndex, lookup, partition) import Alex import Unicode data MyState = MyState { commentDepth :: Integer, inPre :: Bool, currentTagName :: String, allowedAttrs :: [Attr] } tokens :: String -> [Token] tokens inp = gscan hdoc_scan (MyState { commentDepth=0, inPre=False, currentTagName="", allowedAttrs=[] } ) inp hdoc_scan :: GScan MyState [Token] hdoc_scan = load_gscan (hdoc_acts, stop_act) hdoc_lx where -- when the scanner stops with start code "hd3" and the remaining -- input "" or "\n" then we need to add a EndHDoc token as the -- rule "^n %b ^-^- / ~^-" can't detect the end of the HDoc comment! stop_act p _ "" (sc,_) | sc `elem` [hd3, hd3Ty] = [EndHDoc p ""] | otherwise = [] stop_act p@(Pn addr ln col) _ inp (sc,_) | sc `elem` [hd3, hd3Ty] && inp == "\n" = [EndHDoc p inp] | otherwise = [ScanError msg] where msg = "scan error in line " ++ show ln ++ " before\n" ++ " " ++ take 80 inp setStartCode :: StartCode -> GTokenAction s r setStartCode newSC _ _ _ _ cont (_,state) = cont (newSC,state) %} -- %b is any whitespace except ^n -- %z white space possibly containing ^n%b*^-^-%b* -- %o is a operator symbol except ^. -- %i is an unqualified Haskell 98 identifier -- %j is like %i, but without parenthesis -- %n is a module or type constructor name ("Conid") -- %c is a constructor name -- %q a probably qualified function or type constructor or -- constructor name, probably in parenthesis { %b = [^t^r^ ] } { %z = (%b* (^n %b* ^-^- %b*)+) | %b+ } { %o = [^<^>^=^|^$^/^+^-^*^~] } { %i = (^_* [a-z] | ^_+[0-9]) [a-zA-Z0-9^_^']* | ( ^( (^. | %o%o | ^.%o | %o^. | (%o|^.){3,}) ^) ) } { %j = (^_* [a-z] | ^_+[0-9]) [a-zA-Z0-9^_^']* | ( ^. | %o%o | ^.%o | %o^. | (%o|^.){3,} ) } { %n = ^_* [A-Z] [a-zA-Z0-9^_^']* } { %c = %n | ^:%o* | ^:(^:|%o){2,} } { %q = %i | %c | ^( %n^.(%j|%c) ^) } -- { %h = (~[^-^@^n^<^>] | ^-~[^}^n^<])+ } "hdoc_lx"/"hdoc_acts":- <> ::= "0":. <> ::= "0":^n <> ::= "0":^-^- (~[^-^'] | ^-^-^-) .* ::= "0": ~^\ \ ^" %{ string = setStartCode str %} <> ::= "str":. <> ::= "str":^\ ^w* ^n ^w* ^\ <> ::= "str":^\^\ <> ::= "str":^\^" ::= "str":^" %{ endstring = setStartCode 0 %} ::= "0":^{^- %{ comment _ _ _ _ cont (sc,state) = cont (com, state {commentDepth=1}) %} ::= "com":^{^- %{ comment2 _ _ _ _ cont (sc,state) = cont (sc, state {commentDepth = commentDepth state + 1}) %} ::= "com":^-^} %{ endcomment _ _ _ _ cont (sc, state) = if commentDepth state == 1 then cont (0, state {commentDepth = 0}) else cont (com, state {commentDepth = commentDepth state - 1}) %} <> ::= "com":. <> ::= "com":^n <> ::= "0":^{^-^-^} -- old {--- ... -} comment format ::= "0":^{^-^-^- / ~^- %{ hdocOld = gTkn BeginHDoc hd %} ::= "0":^{^-^-^- ^w+ %{ hdocOldw = gTkn BeginHDoc hd %} ::= "hd":"hdTy":^-^} %{ endHDocOld = gTkn EndHDoc 0 %} -- don't drop empty lines (
 ... 
cares!) ::= "hd":^n %{ newline p _ inp len cont st@(sc,s) | inPre s = SText p inp "\n" : cont st | otherwise = SWhiteSpace p inp : cont st %} ::= "0":^-^-^' / ^w %{ hdocCons1 = gTkn BeginHDocCons hd3 %} ::= "0":^-^-^' / ~^- %{ hdocCons2 = gTkn BeginHDocCons hd3 %} ::= "0":^-^-^- / ~^- %{ hdoc3 = gTkn BeginHDoc hd3 %} ::= "0":^-^-^- %z %{ hdoc3w = gTkn BeginHDoc hd3 %} ::= "hd3":"hd3Ty": ^n %b* / (~^- | ^-~^- | ^-^-^-) %{ endhdoc3 = gTkn EndHDoc 0 %} -- skip "--" at the beginning of a new line ::= "hd3": ^n %b* ^-^- / ~^- %{ newline3 p _ inp len cont st@(sc,s) | inPre s = SText p inp "\n" : cont st | otherwise = SWhiteSpace p inp : cont st %} ::= "hd3": ^n %b* ^-^- %b+ %{ newline3w p _ inp len cont st@(sc,s) | inPre s = SText p inp ('\n':w) : cont st | otherwise = SWhiteSpace p inp : cont st where w = drop 2 $ dropWhile isSpace $ take len inp %} ::= "hd":"hd3":%b+ %{ blank p _ inp len cont st@(sc,s) | inPre s = SText p inp (take len inp) : cont st | otherwise = SWhiteSpace p inp : cont st %} ::= "hd":[a-zA-Z0-9^,^;^.^:^*^+^_^{^[^]^!^%^?^(^)^=^'^`^~^#^|^/^\^$]+ ::= "hd": ^-+ / ~[^}] %{ textOldM = text %} %{ textOld = text %} ::= "hd3": [a-zA-Z0-9^,^;^.^:^*^+^-^_^{^}^[^]^!^%^?^(^)^=^'^`^~^#^|^/^\^$]+ %{ text p _ inp len cont s = SText p inp (take len inp) : cont s %} ::= "hd":"hd3":^" | ^< | ^> %{ extSym p _ inp len cont s = SUnicode p inp q : cont s where Just q = lookup (take len inp) sym2u >>= html2unicode sym2u = [("\"", """), ("<", "<"), (">", ">")] %} ::= "hd":"hd3": ^&[a-zA-Z]*^; %{ namedSymbol p@(Pn _ ln col) _ inp len cont s = symbolToken : cont s where symbolToken = case html2unicode (take len inp) of Just uc -> SUnicode p inp uc Nothing -> ScanError ("scan error in line " ++ show ln ++ ":\n " ++ take len inp ++ " is unknown to HDoc") %}
    ::= "hd":"hd3": ^<[pP][rR][eE]^>
%{ pre p _ inp len cont (sc,s) = BeginTag p inp PreTag : cont (sc, s {inPre=True}) %}
 ::= "hd":"hd3": ^<^/[pP][rR][eE]^>
%{ endpre p _ inp len cont (sc,s) = EndTag p inp PreTag : cont (sc, s{inPre=False}) %}


::= "hd":"hd3": ^<(b|B)(r|R)(^/)?^> %{ br = sTkn SBr %} ::= "hd":"hd3": ^<[a-zA-Z]+ %{ tag p _ inp len cont (sc,state) = case lookup tagName tagNames of Just t -> BeginTag p inp t : cont (toTag sc, state { currentTagName = tagName, allowedAttrs = allowed }) where Just allowed = lookup t tagAttrs Nothing -> ScanError ("unknown HTML tag <" ++ tagName ++ ">") : cont (toTag sc, state { currentTagName = "(unknown tag)", allowedAttrs = allAttrs }) where tagName = map toLower $ take (len-1) $ drop 1 inp %} <> ::= "hdTag": %b | ^n <> ::= "hd3Tag": %z ::= "hdTag":"hd3Tag": [a-zA-Z]+ %{ tagAttrName p _ inp len cont s@(sc,state) = case lookup attrName attrNames of Nothing -> ScanError ("unknown attribute `" ++ attrName ++ "'") : cont s Just attr -> if attr `elem` allowedAttrs state then TagAttr p inp attr : cont s else ScanError ("attribute `" ++ attrName ++ "' not allowed\ \ with tag <" ++ currentTagName state ++ ">") : cont s where attrName = map toLower $ take len inp %} ::= "hdTag":"hd3Tag": ^= %{ tagEq = sTkn TagAttrEq %} ::= "hdTag":"hd3Tag": ^" [.#^"]* ^" %{ tagAttrValue p _ inp len cont s = TagAttrValue p inp (take (len-2) $ drop 1 inp) : cont s %} ::= "hdTag":"hd3Tag": ^> %{ tagClose p _ inp len cont (sc,state) = {-CloseTag p inp :-} cont (fromTag sc, state { currentTagName="", allowedAttrs=[] }) %} ::= "hd":"hd3": ^<^/ [a-zA-Z]+ ^> %{ tagEnd p _ inp len cont s = case lookup tagName tagNames of Just t -> EndTag p inp t : cont s Nothing -> ScanError ("unknown HTML tag ") : cont s where tagName = take (len-3) $ drop 2 inp %} ::= "hd":"hd3":^@cons %{ atcons = jTkn AtCons toTy %} ::= "hd":"hd3":^@param %{ atparam = jTkn AtParam toTy %} ::= "hd":"hd3":^@return %{ atret = jTkn AtReturn toTy %} ::= "hd":"hd3":^@monadic %{ atmona = jTkn AtMonadic toTy %} ::= "hd":"hd3":^@author / ^w %{ atauthor = sTkn AtAuthor %} ::= "hd":"hd3":^@version / ^w %{ atversion = sTkn AtVersion %} ::= "hd" :^@see ^w+ %q ^w* %{ atsee = atsee3 %} ::= "hd3":^@see %z %q %z? %{ atsee3 p _ inp len cont s = AtSee p inp ident : cont s where ident = takeWhile (not . isSpace) $ dropZ $ drop 4 $ take len inp %} ::= "hd" :^@doc ^w+ %q ^w* %{ atdoc = atdoc3 %} ::= "hd3":^@doc %z %q %z? %{ atdoc3 p _ inp len cont s = AtDoc p inp ident : cont s where ident = takeWhile (not . isSpace) $ dropZ $ drop 4 $ take len inp %} -- ignore whitespace in "typed" expressions (after @param etc.); -- ignore newline only when scanning a {--- ... -} comment <> ::= "hdTy":"hd3Ty":%b <> ::= "hdTy":^n <> ::= "hd3Ty" : ^n %b* ^-^- %b+ <> ::= "hd3Ty" : ^n %b* ^-^- / ~^- ::= "hdTy":"hd3Ty":^-^> %{ arrow = sTkn Arrow %} ::= "hdTy":"hd3Ty":^=^> %{ implies = sTkn Implies %} ::= "hdTy":"hd3Ty":^:^: %{ ht = sTkn HasType %} ::= "hdTy":"hd3Ty":^, %{ comma = sTkn Comma %} ::= "hdTy":"hd3Ty":^( %{ oparen = sTkn OParen %} ::= "hdTy":"hd3Ty":^) %{ cparen = sTkn CParen %} ::= "hdTy":"hd3Ty":^[ %{ obrack = sTkn OBracket %} ::= "hdTy":"hd3Ty":^] %{ cbrack = sTkn CBracket %} ::= "hdTy":"hd3Ty":^{ %{ obrace = sTkn OBrace %} ::= "hdTy":"hd3Ty":^} %{ cbrace = sTkn CBrace %} ::= "hdTy":"hd3Ty": %i ::= "hdTy":"hd3Ty": %c ::= "hdTy":"hd3Ty" :^- / ~[^>^}] %{ docFollows = mapMode fromTy %} ::= "hdTy":"hd3Ty" : $ / ^@ %{ backToHd = mapMode fromTy %} %{ -- skip %z -- ("-- --" would also be skipped, but %z doesn't match that) dropZ :: String -> String dropZ "" = "" dropZ ('-':'-':xs) = dropZ xs dropZ str@(x:xs) | isSpace x = dropZ xs | otherwise = str ident posn _ inp len cont st = Ident posn inp (take len inp) : cont st name posn _ inp len cont st = Conid posn inp (take len inp) : cont st gTkn :: (Posn -> String -> Token) -> StartCode -> GTokenAction MyState [Token] gTkn tfn newSC posn _ inp _ cont (_,s) = tfn posn inp : cont (newSC, s) jTkn :: (Posn -> String -> Token) -> (StartCode -> StartCode) -> GTokenAction MyState [Token] jTkn tfn f posn _ inp _ cont (sc,s) = tfn posn inp : cont (f sc, s) sTkn tfn posn _ inp _ cont st = tfn posn inp : cont st setMode mode _ _ _ _ cont (_,s) = cont (mode, s) mapMode f _ _ _ _ cont (sc,s) = cont (f sc, s) toTy :: StartCode -> StartCode toTy sc | sc == hd3 = hd3Ty | sc == hd = hdTy | otherwise = error "internal error in scanner: unexpected start code ?!" fromTy :: StartCode -> StartCode fromTy sc | sc == hd3Ty = hd3 | sc == hdTy = hd | otherwise = error "internal error in scanner: unexpected start code ?!" toTag :: StartCode -> StartCode toTag sc | sc == hd3 = hd3Tag | sc == hd = hdTag | otherwise = error "internal error in scanner: unexpected start code ?!" fromTag :: StartCode -> StartCode fromTag sc | sc == hd3Tag = hd3 | sc == hdTag = hd | otherwise = error "internal error in scanner: unexpected start code ?!" data Token = BeginHDoc { pos :: Posn, remainingInput :: String } | EndHDoc { pos :: Posn, remainingInput :: String } | BeginHDocCons { pos :: Posn, remainingInput :: String } | AtCons { pos :: Posn, remainingInput :: String } | AtParam { pos :: Posn, remainingInput :: String } | AtReturn { pos :: Posn, remainingInput :: String } | AtMonadic { pos :: Posn, remainingInput :: String } | Ident { pos :: Posn, remainingInput :: String, value :: String } | Conid { pos :: Posn, remainingInput :: String, value :: String } | Arrow { pos :: Posn, remainingInput :: String } | Implies { pos :: Posn, remainingInput :: String } | HasType { pos :: Posn, remainingInput :: String } | Comma { pos :: Posn, remainingInput :: String } | OParen { pos :: Posn, remainingInput :: String } | CParen { pos :: Posn, remainingInput :: String } | OBracket { pos :: Posn, remainingInput :: String } | CBracket { pos :: Posn, remainingInput :: String } | OBrace { pos :: Posn, remainingInput :: String } | CBrace { pos :: Posn, remainingInput :: String } | AtVersion { pos :: Posn, remainingInput :: String } | AtAuthor { pos :: Posn, remainingInput :: String } | AtSee { pos :: Posn, remainingInput :: String, value :: String } | AtDoc { pos :: Posn, remainingInput :: String, value :: String } | BeginTag { pos :: Posn, remainingInput :: String, theTag :: Tag } | EndTag { pos :: Posn, remainingInput :: String, theTag :: Tag } | TagAttr { pos :: Posn, remainingInput :: String, attribute :: Attr } | TagAttrValue { pos :: Posn, remainingInput :: String, attrValue :: String } | TagAttrEq { pos :: Posn, remainingInput :: String } | SBr { pos :: Posn, remainingInput :: String } | SUnicode { pos :: Posn, remainingInput :: String, uniVal :: Unicode } | SText { pos :: Posn, remainingInput :: String, value :: String } | SWhiteSpace { pos :: Posn, remainingInput :: String } | ScanError String deriving Show data Tag = CodeTag | PreTag | EmTag | BoldTag | ParagraphTag | TTTag | ItalicsTag | StrongTag | AnchorTag deriving (Eq, Show) data Attr = HrefAttr | ClassAttr deriving (Eq, Show) data TagAttr = TaHref String | TaClass String deriving (Eq, Show) attrToTagAttr :: Attr -> String -> Either String TagAttr attrToTagAttr HrefAttr val = Right $ TaHref val attrToTagAttr ClassAttr val = Right $ TaClass val allAttrs :: [Attr] allAttrs = [HrefAttr, ClassAttr] attrNames :: [(String, Attr)] attrNames = [("href", HrefAttr), ("class", ClassAttr) ] tagAttrs :: [(Tag, [Attr])] tagAttrs = [(ParagraphTag, [ClassAttr]), (AnchorTag, [HrefAttr]) ] tagNames :: [(String, Tag)] tagNames = [("p", ParagraphTag), ("code", CodeTag), ("em", EmTag), ("b", BoldTag), ("tt", TTTag), ("i", ItalicsTag), ("strong", StrongTag), ("a", AnchorTag) ] nameOfTag :: Tag -> String nameOfTag t = case [n | (n,t') <- tagNames, t' == t] of [] -> error ("Oops: Tag `" ++ show t ++ "' doesn't have a name?\n\ \ This is a bug in HDoc, please report it.") (x:_) -> x %}