/[ascend]/branches/hugo/ocaml/lexer.mll
ViewVC logotype

Contents of /branches/hugo/ocaml/lexer.mll

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3167 - (show annotations) (download)
Tue Aug 16 10:54:02 2016 UTC (2 years, 3 months ago) by hugo
File size: 14733 byte(s)
Alternate parser, written in Ocaml

This commit contains an experimental parser for ASCEND, written in Ocaml

1 {
2 open Parser
3 open Types
4
5 open Lexing
6 open Containers
7
8 let update_line_number lexbuf =
9 Lexing.lexeme lexbuf |> String.iter (fun c ->
10 if c = '\n' then Lexing.new_line lexbuf
11 )
12 }
13
14 let blank = ['\x0C' '\x0D' '\x09' '\x0B' '\x20'] (*[\f\r\t\v ]*)
15 let digit = ['0'-'9']
16 let letter = ['a'-'z' 'A'-'Z']
17
18 let exp = (['e''E']['-''+']? digit+)
19 let real = ((((digit+ "." digit*)|("." digit+)) exp?) | (digit+ exp))
20 let integer = digit+
21 let idChar = letter (integer|letter|'_')*
22
23 rule
24 initial = parse
25 | eof { EOF }
26
27 (* Multichar operators *)
28 | "<=" { LEQ }
29 | ">=" { GEQ }
30 | "<>" { NEQ }
31 | ".." { DOTDOT }
32 | "::" { DBLCOLON }
33 | ":=" { ASSIGN }
34 | ":==" { CASSIGN }
35 | "==" { BEQ }
36 | "!=" { BNE }
37 (* Single-char operators and tokens *)
38 | "=" { EQ }
39 | ">" { GT }
40 | "<" { LT }
41 | "," { COMMA }
42 | "." { DOT }
43 | ";" { SEMICOLON }
44 | ":" { COLON }
45 | "[" { LBRACKET }
46 | "]" { RBRACKET }
47 | "(" { LPAREN }
48 | ")" { RPAREN }
49 | "+" { PLUS }
50 | "-" { MINUS }
51 | "*" { TIMES }
52 | "/" { DIV }
53 | "^" { CIRCUMFLEX }
54 | "|" { PIPE }
55 (* Reserverd Words *)
56 | idChar as name {
57 match name with
58 | "ADD" -> ADD
59 | "ALIASES" -> ALIASES
60 | "AND" -> AND
61 | "ANY" -> ANY
62 | "ARE_ALIKE" -> AREALIKE
63 | "ARE_THE_SAME" -> ARETHESAME
64 (*| "ARRAY" -> ARRAY *)
65 | "ASSERT" -> ASSERT
66 | "ATOM" -> ATOM
67 | "BREAK" -> BREAK
68 | "CALL" -> CALL
69 | "CARD" -> CARD
70 | "CASE" -> CASE
71 | "CHECK" -> CHECK
72 | "CHILDREN" -> CHILDREN
73 | "CHOICE" -> CHOICE
74 | "CONDITIONAL" -> CONDITIONAL
75 | "CONSTANT" -> CONSTANT
76 | "CONTINUE" -> CONTINUE
77 | "CREATE" -> CREATE
78 (*| "DATA" -> DATA *)
79 | "DECREASING" -> DECREASING
80 | "DEFAULT" -> DEFAULT
81 | "DEFINITION" -> DEFINITION
82 | "DER" -> DER
83 | "der" -> DERIV
84 | "DERIVATIVE" -> DERIVATIVE
85 | "DIMENSION" -> DIMENSION
86 | "DIMENSIONLESS" -> DIMENSIONLESS
87 | "DO" -> DO
88 | "ELSE" -> ELSE
89 | "END" -> END
90 | "EVENT" -> EVENT
91 | "EXPECT" -> EXPECT
92 | "EXTERNAL" -> EXTERNAL
93 | "FALSE" -> FALSE
94 | "FALL_THROUGH" -> FALLTHRU
95 | "FIX" -> FIX
96 | "FOR" -> FOR
97 | "FREE" -> FREE
98 | "FROM" -> FROM
99 | "GLOBAL" -> GLOBAL
100 | "IF" -> IF
101 | "'ignore'" -> IGNORE
102 | "IMPORT" -> IMPORT
103 | "IN" -> IN
104 | "INCREASING" -> INCREASING
105 | "INDEPENDENT" -> INDEPENDENT
106 (*| "INPUT" -> INPUT *)
107 | "INTERSECTION" -> INTERSECTION
108 | "IS_A" -> ISA
109 | "IS_REFINED_TO" -> ISREFINEDTO
110 | "LIKE" -> LIKE
111 | "LINK" -> LINK
112 | "MAXIMIZE" -> MAXIMIZE
113 | "MAX_INTEGER" -> MAXINTEGER
114 | "MAX_REAL" -> MAXREAL
115 | "METHOD" -> METHOD
116 | "METHODS" -> METHODS
117 | "MINIMIZE" -> MINIMIZE
118 | "MODEL" -> MODEL
119 | "NOT" -> NOT
120 | "NOTES" -> NOTES
121 | "OF" -> OF
122 | "OPTION" -> OPTION
123 | "OR" -> OR
124 | "OTHERWISE" -> OTHERWISE
125 (*| "OUTPUT" -> OUTPUT *)
126 | "pre" -> PRE
127 | "PREVIOUS" -> PREVIOUS
128 | "PROD" -> PROD
129 | "PROVIDE" -> PROVIDE
130 | "REFINES" -> REFINES
131 | "REPLACE" -> REPLACE
132 | "REQUIRE" -> REQUIRE
133 | "RETURN" -> RETURN
134 | "RUN" -> RUN
135 | "SATISFIED" -> SATISFIED
136 | "SELECT" -> SELECT
137 (*| "SIZE" -> SIZE *)
138 | "SOLVE" -> SOLVE
139 | "SOLVER" -> SOLVER
140 | "STOP" -> STOP
141 | "SUCH_THAT" -> SUCHTHAT
142 | "SUM" -> SUM
143 | "SWITCH" -> SWITCH
144 | "THEN" -> THEN
145 | "TRUE" -> TRUE
146 | "UNION" -> UNION
147 | "UNITS" -> UNITS
148 | "UNIVERSAL" -> UNIVERSAL
149 | "UNLINK" -> UNLINK
150 | "USE" -> USE
151 (*| "VALUE" -> VALUE *)
152 | "WHEN" -> WHEN
153 | "WHERE" -> WHERE
154 | "WHILE" -> WHILE
155 | "WILL_BE" -> WILLBE
156 | "WILL_BE_THE_SAME" -> WILLBETHESAME
157 | "WILL_NOT_BE_THE_SAME" -> WILLNOTBETHESAME
158 | "WITH" -> WITH
159 | "WITH_VALUE" -> WITH_VALUE
160
161 | _ -> (IDENTIFIER (Identifier name))
162 }
163
164 | "(*" { comment 1 lexbuf; initial lexbuf }
165 | "\'" { symbol (Buffer.create 20) lexbuf }
166 | "\"" { doublequote (Buffer.create 20) lexbuf }
167
168 | "{" (blank* '\n')? {
169 update_line_number lexbuf;
170 bracedtext 1 (Buffer.create 20) lexbuf
171 }
172
173 (* We do not want 1..2 to parse as 1. followed by .2
174 * Se we use a big hack to pretend ocamllex has lookahead *)
175 | (integer as nstr) ".." {
176 lexbuf.lex_curr_pos <- lexbuf.lex_curr_pos - 2;
177 (INTEGER (int_of_string nstr))
178 }
179
180 | integer as nstr { (INTEGER (int_of_string nstr)) }
181 | real as rstr { (REAL (float_of_string rstr)) }
182
183 | blank* { update_line_number lexbuf; initial lexbuf }
184 | '\n' { update_line_number lexbuf; initial lexbuf }
185
186 | _ as c {
187 let msg = Printf.sprintf "Unexpected char '%c'" c in
188 raise (Types.LexerError msg)
189 }
190
191 and
192 comment nesting = parse
193 | eof { raise (Types.LexerError "Unfinished comment") }
194 | "(*" { comment (nesting + 1) lexbuf }
195 | "*)" { if nesting == 1 then () else comment (nesting - 1) lexbuf }
196 | _ { comment nesting lexbuf }
197
198 and
199 symbol strbuf = parse
200 | eof { raise (Types.LexerError "Unfinished symbol") }
201 | '\n' { raise (Types.LexerError "Unfinished symbol") }
202 | '\'' { (SYMBOL (Symbol (Buffer.contents strbuf))) }
203 | _ as c {
204 update_line_number lexbuf;
205 Buffer.add_char strbuf c;
206 symbol strbuf lexbuf
207 }
208
209 and
210 doublequote strbuf = parse
211 | eof { raise (Types.LexerError "Unfinished string") }
212 | '"' { (DQUOTE (DQuote (Buffer.contents strbuf))) }
213 | '\\' {
214 Buffer.add_string strbuf (backslash lexbuf);
215 doublequote strbuf lexbuf
216 }
217 | _ as c {
218 update_line_number lexbuf;
219 Buffer.add_char strbuf c;
220 doublequote strbuf lexbuf
221 }
222
223 and
224 bracedtext nesting strbuf = parse
225 | eof { raise (Types.LexerError "Unfinished braced text") }
226 | '{' as c {
227 Buffer.add_char strbuf c;
228 bracedtext (nesting + 1) strbuf lexbuf
229 }
230 | '}' as c {
231 if nesting == 1 then begin
232 (BRACEDTEXT (BracedText (Buffer.contents strbuf)))
233 end else begin
234 Buffer.add_char strbuf c;
235 bracedtext (nesting - 1) strbuf lexbuf
236 end
237 }
238 | '\\' {
239 Buffer.add_string strbuf (backslash lexbuf);
240 bracedtext nesting strbuf lexbuf
241 }
242 | _ as c {
243 update_line_number lexbuf;
244 Buffer.add_char strbuf c;
245 bracedtext nesting strbuf lexbuf
246 }
247
248
249 and
250 backslash = parse
251 | 'a' { "\x07" }
252 | 'b' { "\x08" }
253 | 'f' { "\x0C" }
254 | 'n' { "\x0A" }
255 | 'r' { "\x0D" }
256 | 't' { "\x09" }
257 | 'v' { "\x0B" }
258 | 'x' { raise (Types.LexerError "\\xhh escapes not implemented yet") }
259 | ['0'-'7'] { raise (Types.LexerError "Octal escapes not implemented yet") }
260 | '\n' {
261 update_line_number lexbuf;
262 "\n"
263 }
264 | _ as c { String.make 1 c }
265
266
267 {
268
269 let ascend_quote ldelim rdelim specialset =
270 let re = Str.regexp "[{}]" in
271 fun str -> ldelim ^ Str.global_replace re "\\\\\\0" str ^ rdelim
272
273 let quote_ascend_bracedtext = ascend_quote "{" "}" "[{}]"
274 let quote_ascend_symbol = ascend_quote "\'" "\'" "[\']"
275 let quote_ascend_dquote = ascend_quote "\"" "\"" "[\"]"
276
277
278 (* Convert token back to how it looked before we lexed it (as much as we can)
279 * This conversion is not guaranteed to roundtrip 100% because:
280 * - Some things may be represented multiple ways (ex.: 100.0 vs 1.0e2),
281 * - Lexing throws aways some things (comments, leading space in braced text, etc)
282 * - I'm using Ocaml's string_of_XXX functions instead of functions tuned for ASCEND
283 * - and maybe more... *)
284 let string_of_token = function
285 | EOF -> ""
286 | LEQ -> "<="
287 | GEQ -> ">="
288 | NEQ -> "<>"
289 | DOTDOT -> ".."
290 | DBLCOLON -> "::"
291 | ASSIGN -> ":="
292 | CASSIGN -> ":=="
293 | BEQ -> "=="
294 | BNE -> "!="
295
296 | EQ -> "="
297 | LT -> ">"
298 | GT -> "<"
299 | COMMA -> ","
300 | DOT -> "."
301 | SEMICOLON -> ";"
302 | COLON -> ":"
303 | LBRACKET -> "["
304 | RBRACKET -> "]"
305 | LPAREN -> "("
306 | RPAREN -> ")"
307 | PLUS -> "+"
308 | MINUS -> "-"
309 | TIMES -> "*"
310 | DIV -> "/"
311 | CIRCUMFLEX -> "^"
312 | PIPE -> "|"
313
314 | ADD -> "ADD"
315 | ALIASES -> "ALIASES"
316 | AND -> "AND"
317 | ANY -> "ANY"
318 | AREALIKE -> "ARE_ALIKE"
319 | ARETHESAME -> "ARE_THE_SAME"
320 (*| ARRAY -> "ARRAY"*)
321 | ASSERT -> "ASSERT"
322 | ATOM -> "ATOM"
323 | BREAK -> "BREAK"
324 | CALL -> "CALL"
325 | CARD -> "CARD"
326 | CASE -> "CASE"
327 | CHOICE -> "CHOICE"
328 | CHECK -> "CHECK"
329 | CHILDREN -> "CHILDREN"
330 | CONDITIONAL -> "CONDITIONAL"
331 | CONSTANT -> "CONSTANT"
332 | CONTINUE -> "CONTINUE"
333 | CREATE -> "CREATE"
334 (*|DATA -> "DATA"*)
335 | DECREASING -> "DECREASING"
336 | DEFAULT -> "DEFAULT"
337 | DEFINITION -> "DEFINITION"
338 | DER -> "DER"
339 | DERIV -> "der"
340 | DERIVATIVE -> "DERIVATIVE"
341 | DIMENSION -> "DIMENSION"
342 | DIMENSIONLESS -> "DIMENSIONLESS"
343 | DO -> "DO"
344 | ELSE -> "ELSE"
345 | END -> "END"
346 | EVENT -> "EVENT"
347 | EXPECT -> "EXPECT"
348 | EXTERNAL -> "EXTERNAL"
349 | FALSE -> "FALSE"
350 | FALLTHRU -> "FALL_THROUGH"
351 | FIX -> "FIX"
352 | FOR -> "FOR"
353 | FREE -> "FREE"
354 | FROM -> "FROM"
355 | GLOBAL -> "GLOBAL"
356 | IF -> "IF"
357 | IGNORE -> "'ignore'"
358 | IMPORT -> "IMPORT"
359 | IN -> "IN"
360 | INCREASING -> "INCREASING"
361 | INDEPENDENT -> "INDEPENDENT"
362 (*| INPUT -> "INPUT"*)
363 | INTERSECTION -> "INTERSECTION"
364 | IS -> "IS"
365 | ISA -> "IS_A"
366 | ISREFINEDTO -> "IS_REFINED_TO"
367 | LIKE -> "LIKE"
368 | LINK -> "LINK"
369 | MAXIMIZE -> "MAXIMIZE"
370 | MAXINTEGER -> "MAX_INTEGER"
371 | MAXREAL -> "MAX_REAL"
372 | METHOD -> "METHOD"
373 | METHODS -> "METHODS"
374 | MINIMIZE -> "MINIMIZE"
375 | MODEL -> "MODEL"
376 | NOT -> "NOT"
377 | NOTES -> "NOTES"
378 | OF -> "OF"
379 | OPTION -> "OPTION"
380 | OR -> "OR"
381 | OTHERWISE -> "OTHERWISE"
382 (*| OUTPUT -> "OUTPUT"*)
383 | PATCH -> "PATCH"
384 | PRE -> "pre"
385 | PREVIOUS -> "PREVIOUS"
386 | PROD -> "PROD"
387 | PROVIDE -> "PROVIDE"
388 | REFINES -> "REFINES"
389 | REPLACE -> "REPLACE"
390 | REQUIRE -> "REQUIRE"
391 | RETURN -> "RETURN"
392 | RUN -> "RUN"
393 | SATISFIED -> "SATISFIED"
394 | SELECT -> "SELECT"
395 (*| SIZE -> "SIZE"*)
396 | SOLVE -> "SOLVE"
397 | SOLVER -> "SOLVER"
398 | STOP -> "STOP"
399 | SUCHTHAT -> "SUCHTHAT"
400 | SUM -> "SUM"
401 | SWITCH -> "SWITCH"
402 | THEN -> "THEN"
403 | TRUE -> "TRUE"
404 | UNION -> "UNION"
405 | UNITS -> "UNITS"
406 | UNIVERSAL -> "UNIVERSAL"
407 | UNLINK -> "UNLINK"
408 | USE -> "USE"
409 (*| VALUE -> "VALUE"*)
410 | WHEN -> "WHEN"
411 | WHERE -> "WHERE"
412 | WHILE -> "WHILE"
413 | WILLBE -> "WILL_BE"
414 | WILLBETHESAME -> "WILL_BE_THE_SAME"
415 | WILLNOTBETHESAME -> "WILL_NOT_BE_THE_SAME"
416 | WITH -> "WITH"
417 | WITH_VALUE -> "WITH_VALUE"
418 | REAL x -> string_of_float x
419 | INTEGER x -> string_of_int x
420 | IDENTIFIER (Identifier x) -> x
421 | BRACEDTEXT (BracedText x) -> quote_ascend_bracedtext x
422 | SYMBOL (Symbol x) -> quote_ascend_symbol x
423 | DQUOTE (DQuote x) -> quote_ascend_dquote x
424
425
426
427 (* Stringify token in the format used by "menhir --interpret" *)
428 (* This can be helpful if you need to find out why the grammar is giving syntax errors *)
429 let debug_string_of_token = function
430 | EOF -> "EOF"
431 | LEQ -> "LEQ"
432 | GEQ -> "GEQ"
433 | NEQ -> "NEQ"
434 | DOTDOT -> "DOTDOT"
435 | DBLCOLON -> "DBLCOLON"
436 | ASSIGN -> "ASSIGN"
437 | CASSIGN -> "CASSIGN"
438 | BEQ -> "BEQ"
439 | BNE -> "BNE"
440
441 | EQ -> "EQ"
442 | LT -> "LT"
443 | GT -> "GT"
444 | COMMA -> "COMMA"
445 | DOT -> "DOT"
446 | SEMICOLON -> "SEMICOLON"
447 | COLON -> "COLON"
448 | LBRACKET -> "LBRACKET"
449 | RBRACKET -> "RBRACKET"
450 | LPAREN -> "LPAREN"
451 | RPAREN -> "RPAREN"
452 | PLUS -> "PLUS"
453 | MINUS -> "MINUS"
454 | TIMES -> "TIMES"
455 | DIV -> "DIV"
456 | CIRCUMFLEX -> "CIRCUMFLEX"
457 | PIPE -> "PIPE"
458
459 | AREALIKE -> "AREALIKE"
460 | ARETHESAME -> "ARETHESAME"
461 | DERIV -> "DERIV"
462 | FALLTHRU -> "FALLTHRU"
463 | IGNORE -> "IGNORE"
464 | ISA -> "ISA"
465 | ISREFINEDTO -> "ISREFINEDTO"
466 | MAXINTEGER -> "MAXINTEGER"
467 | PRE -> "PRE"
468 | WILLBE -> "WILLBE"
469 | WILLBETHESAME -> "WILLBETHESAME"
470 | WILLNOTBETHESAME -> "WILLNOTBETHESAME"
471 | WITH -> "WITH"
472 | WITH_VALUE -> "WITH_VALUE"
473
474 | REAL _ -> "REAL"
475 | INTEGER _ -> "INTEGER"
476 | IDENTIFIER _ -> "IDENTIFIER"
477 | BRACEDTEXT _ -> "BRACEDTEXT"
478 | SYMBOL _ -> "SYMBOL"
479 | DQUOTE _ -> "DQUOTE"
480
481 | tok -> string_of_token tok
482 }

john.pye@anu.edu.au
ViewVC Help
Powered by ViewVC 1.1.22