/* Generated parser/lexer template for a CPython 2.x extension module.
   Requires Python 2.2+ built with Unicode support. */
#if PY_VERSION_HEX < 0x02020000 || !defined(Py_USING_UNICODE)
#error "Python 2.2+ with unicode support required"
#endif

/* Static Definitions */
#define YYEMPTY -2                    /* no lookahead token has been read */
#define YYERROR -1                    /* lexer signalled an error */
#define YYEOF 0                       /* end-of-input token */
#define YYINITDEPTH 1000              /* fixed parser stack depth */
#define LEXER_INITIAL_BACKTRACKS 20   /* initial backtrack-stack capacity */

/* Parsing objects */

/* The parser instance exposed to Python. */
typedef struct {
  PyObject_HEAD
  int verbose;        /* non-zero enables stderr tracing ("debug" member) */
  PyObject *dict;     /* per-instance __dict__ */
} parserobject;

/* Scanner state over a single unicode input string. */
typedef struct {
  PyObject *text;           /* owned reference to the unicode input */
  int last;
  int state;                /* current lexer start-state (indexes pattern tables) */
  Py_UNICODE *end;          /* one past the last character of text */
  Py_UNICODE *position;     /* current scan position within text */
  /* backtracking stack */
  int backtracks;           /* number of saved positions in use */
  Py_UNICODE **positions;   /* saved positions for repeat backtracking */
  int allocated;            /* capacity of the positions array */
} lexerobject;

/* forward declarations */
static int parser_yylex(parserobject *, lexerobject *, PyObject **);
static lexerobject *lexer_new(PyObject *);
static void lexer_free(lexerobject *);
static int lexer_save_position(lexerobject *);
static Py_UNICODE *lexer_restore_position(lexerobject *);
static int lexer_charset(parserobject *, Py_UCS4 *, Py_UCS4, int);
static int lexer_match(parserobject *, lexerobject *, Py_UCS4 *);
static void lexer_error(lexerobject *);
static char *unicode_escape(Py_UNICODE *, int);
static PyObject *report_error(int state, PyObject *lval, lexerobject *lexer);
static void print_reduce(int ruleno);
static void print_state_stack(int *stack, int *end);

/* Parser Methods */

/* TRACE writes to stderr only when the instance's debug flag is set;
   it expands inside functions that have a `self` in scope. */
#define TRACE(...) if (self->verbose) PySys_WriteStderr(__VA_ARGS__)
/* REGEX_TRACE additionally requires a compile-time LEXER_DEBUG define,
   since regex-engine tracing is extremely noisy. */
#ifdef LEXER_DEBUG
#define REGEX_TRACE(...) TRACE(__VA_ARGS__)
#else
#define REGEX_TRACE(...)
#endif

static char parser_parse_doc[] = "\
parse(string) -> object\n\
Converts the given string to a parse tree and return the top-most\n\
element of the tree.";

/* The LALR(1) driver loop.  Tables (action_idx, default_action, rhs_size,
   derives, goto_idx, yycheck, yytable, default_goto, token_names) are emitted
   by the generator; `$$parser` is replaced by the grammar's action routines. */
static PyObject *parser_parse(register parserobject *self, PyObject *args)
{
  PyObject *text;
  register int yystate;
  register int yyn;
  PyObject *yylval = NULL;              /* semantic value from the lexer */
  PyObject *yyval = NULL;               /* semantic value of a reduction */
  int state_stack[YYINITDEPTH];
  int *state_ptr;
  PyObject *value_stack[YYINITDEPTH];
  PyObject **value_ptr;
  int yylen;
  int yychar = YYEMPTY;                 /* cause a token to be read */
  int yychar1 = 0;                      /* yychar translated to internal form */
  lexerobject *lexer;

  if (!PyArg_ParseTuple(args, "O:parse", &text)) return NULL;

  lexer = lexer_new(text);
  if (lexer == NULL) return NULL;

  TRACE("Starting parse\n");

  /* Initialize stack pointers
     Waste one element of value and location stack so that they stay on the
     same level as the state stack.  The wasted elements are never
     initialized. */
  state_ptr = state_stack - 1;
  value_ptr = value_stack;
  yystate = 0;

  while (1) {
    /* Push a new state, which is found in yystate. */
    /* In all cases, when you get here, the value and location stacks have
       just been pushed.  So pushing a state here evens the stacks. */
    *++state_ptr = yystate;
    TRACE("Entering state %d\n", yystate);

    /* Do appropriate processing given the current state. */
    /* Read a lookahead token if we need one and don't already have one. */
    /* First try to decide what to do without reference to lookahead
       token. */
    yyn = action_idx[yystate];
    if (yyn == YYFLAG) {
      /* this state's action is independent of the lookahead */
      yyn = default_action[yystate];
      if (yyn == 0) {
        /* NOTE(review): report_error does not release the lexer struct or
           its backtrack array here; see report_error for the cleanup. */
        return report_error(yystate, yylval, lexer);
      }

      /* Do a reduction.  yyn is the number of a rule to reduce with. */
      if (self->verbose) print_reduce(yyn);
      yylen = rhs_size[yyn];
      /* pop the rule's right-hand side off both stacks */
      state_ptr -= yylen;
      value_ptr -= yylen;
      /* default semantic value is the first RHS value */
      if (yylen > 0) yyval = value_ptr[1];

      /* Action routines */
      switch (yyn) {
        $$parser
      }
      if (!yyval) {
        /* an action routine raised an exception */
        lexer_free(lexer);
        return NULL;
      }
      *++value_ptr = yyval;
      if (self->verbose) print_state_stack(state_stack, state_ptr);

      /* Now "shift" the result of the reduction.  Determine what state
         that goes to, based on the state we popped back to and the rule
         number reduced by. */
      yyn = derives[yyn] - YYNTBASE;
      yystate = goto_idx[yyn] + *state_ptr;
      if (yystate >= 0 && yystate <= YYLAST &&
          yycheck[yystate] == *state_ptr) {
        yystate = yytable[yystate];
      } else {
        yystate = default_goto[yyn];
      }
      continue;
    } /* if (yyn == YYFLAG) */

    /* Not known => get a lookahead token if don't already have one. */
    /* yychar is either YYEMPTY, YYEOF or a valid token in external form */
    if (yychar == YYEMPTY) {
      TRACE("Reading a token: ");
      yychar = parser_yylex(self, lexer, &yylval);
    }

    /* Convert token to internal form (in yychar1) for indexing tables
       with */
    if (yychar <= 0) {
      if (yychar == YYERROR) {
        /* the lexer already set an exception */
        lexer_free(lexer);
        return NULL;
      }
      /* This means end-of-input. */
      yychar1 = 0;
      TRACE("Now at end of input.\n");
    } else {
      yychar1 = YYTRANSLATE(yychar);
      TRACE("Next token is %d (%s)\n", yychar, token_names[yychar1]);
      yyn += yychar1;
    }

    if (yyn < 0 || yyn > YYLAST || yycheck[yyn] != yychar1) {
      /* comes here after end of input */
      yyn = default_action[yystate];
      if (yyn == 0) {
        return report_error(yystate, NULL, lexer);
      }

      /* Do a reduction.  yyn is the number of a rule to reduce with. */
      if (self->verbose) print_reduce(yyn);
      yylen = rhs_size[yyn];
      state_ptr -= yylen;
      value_ptr -= yylen;
      if (yylen > 0) yyval = value_ptr[1];

      /* Action routines */
      switch (yyn) {
        $$parser
      }
      if (!yyval) {
        lexer_free(lexer);
        return NULL;
      }
      *++value_ptr = yyval;
      if (self->verbose) print_state_stack(state_stack, state_ptr);

      /* Now "shift" the result of the reduction.  Determine what state
         that goes to, based on the state we popped back to and the rule
         number reduced by. */
      yyn = derives[yyn] - YYNTBASE;
      yystate = goto_idx[yyn] + *state_ptr;
      if (yystate >= 0 && yystate <= YYLAST &&
          yycheck[yystate] == *state_ptr) {
        yystate = yytable[yystate];
      } else {
        yystate = default_goto[yyn];
      }
      continue;
    }

    yyn = yytable[yyn];
    /* yyn is what to do for this token type in this state.
         Negative => reduce, -yyn is rule number.
         Positive => shift, yyn is new state.  New state is final state =>
           don't bother to shift just return success.
         0, or max negative number => error. */
    if (yyn > YYFLAG && yyn < 0) {
      yyn = -yyn;

      /* Do a reduction.  yyn is the number of a rule to reduce with. */
      if (self->verbose) print_reduce(yyn);
      yylen = rhs_size[yyn];
      state_ptr -= yylen;
      value_ptr -= yylen;
      if (yylen > 0) yyval = value_ptr[1];

      /* Action routines */
      switch (yyn) {
        $$parser
      }
      if (!yyval) {
        lexer_free(lexer);
        return NULL;
      }
      *++value_ptr = yyval;
      if (self->verbose) print_state_stack(state_stack, state_ptr);

      /* Now "shift" the result of the reduction.  Determine what state
         that goes to, based on the state we popped back to and the rule
         number reduced by. */
      yyn = derives[yyn] - YYNTBASE;
      yystate = goto_idx[yyn] + *state_ptr;
      if (yystate >= 0 && yystate <= YYLAST &&
          yycheck[yystate] == *state_ptr) {
        yystate = yytable[yystate];
      } else {
        yystate = default_goto[yyn];
      }
      continue;
    } else if (yyn == YYFINAL) {
      /* Hooray!  Process complete. */
      lexer_free(lexer);
      return value_ptr[-1];
    } else if (yyn <= 0) {
      /* Now it is either 0 or YYFLAG */
      return report_error(yystate, yylval, lexer);
    }

    /* Shift the lookahead token. */
    TRACE("Shifting token %d (%s), ", yychar, token_names[yychar1]);
    /* Discard the token being shifted unless it is eof. */
    if (yychar != YYEOF) {
      yychar = YYEMPTY;
    }
    *++value_ptr = yylval;
    yystate = yyn;
    continue;
  }
  /* should never get here */
  Py_INCREF(Py_None);
  lexer_free(lexer);
  return Py_None;
}

/** lexer routines ****************************************************/

/* Allocate a lexer over `text` (coerced to unicode).  Returns NULL with an
   exception set on failure.  Release with lexer_free(). */
static lexerobject *lexer_new(PyObject *text)
{
  lexerobject *lexer;

  lexer = PyMem_New(lexerobject, 1);
  if (lexer == NULL) {
    PyErr_NoMemory();
    return NULL;
  }

  /* attempt to coerce given object to unicode using default rules */
  lexer->text = PyUnicode_FromObject(text);
  if (lexer->text == NULL) {
    PyMem_Free(lexer);
    return NULL;
  }
  lexer->position = PyUnicode_AS_UNICODE(lexer->text);
  lexer->end = lexer->position + PyUnicode_GET_SIZE(lexer->text);
  lexer->state = LEXER_INITIAL;

  /* create initial backtracking stack */
  lexer->positions = PyMem_New(Py_UNICODE *, LEXER_INITIAL_BACKTRACKS);
  if (lexer->positions == NULL) {
    PyErr_NoMemory();
    Py_DECREF(lexer->text);
    PyMem_Free(lexer);
    return NULL;
  }
  lexer->allocated = LEXER_INITIAL_BACKTRACKS;
  lexer->backtracks = 0;
  return lexer;
}

/* Release everything lexer_new() allocated. */
static void lexer_free(lexerobject *lexer)
{
  PyMem_Free(lexer->positions);
  Py_DECREF(lexer->text);
  PyMem_Free(lexer);
}

/* Push the current scan position on the backtrack stack, growing the stack
   if needed.  Returns 0 on success, -1 (with MemoryError set) on failure. */
static int lexer_save_position(lexerobject *lexer)
{
  Py_UNICODE **positions;
  size_t new_allocated;
  int allocated, newsize;

  /* Bypass realloc() when a previous overallocation is large enough to
     accommodate the newsize. */
  newsize = lexer->backtracks + 1;
  allocated = lexer->allocated;
  positions = lexer->positions;
  if (newsize >= allocated) {
    /* This over-allocates proportional to the list size, making room
     * for additional growth.  The over-allocation is mild, but is
     * enough to give linear-time amortized behavior over a long
     * sequence of appends() in the presence of a poorly-performing
     * system realloc().
     * The growth pattern is: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
     */
    new_allocated = (newsize >> 3) + (newsize < 9 ?
3 : 6) + newsize;
    if (PyMem_Resize(positions, Py_UNICODE *, new_allocated) == NULL) {
      PyErr_NoMemory();
      return -1;
    }
    lexer->allocated = new_allocated;
    lexer->positions = positions;
  }
  lexer->positions[lexer->backtracks] = lexer->position;
  lexer->backtracks = newsize;
  return 0;
}

/* Pop the most recently saved position and make it current.
   The caller must ensure at least one position is saved. */
static Py_UNICODE *lexer_restore_position(lexerobject *lexer)
{
  assert(lexer->backtracks > 0);
  lexer->position = lexer->positions[--lexer->backtracks];
  return lexer->position;
}

/* Test character `ch` against a compiled character-set program `set`.
   Returns `ok` on membership, `!ok` on non-membership, -1 on a corrupt
   program.  Note, the tests are stored sorted to allow for quick exits. */
static int lexer_charset(parserobject *self, Py_UCS4 *set, Py_UCS4 ch, int ok)
{
  unsigned char *charset;

  /* check if character is a member of the given set */
  for (;;) {
    switch (*set++) {
    case LEXER_CHARSET_LITERAL:
      /* <literal> */
      REGEX_TRACE("CHARSET_LITERAL, %d == %d\n", ch, set[0]);
      if (ch < set[0]) return !ok;            /* sorted: no later test can match */
      else if ((Py_UCS4)ch == set[0]) return ok;
      set++;
      break;
    case LEXER_CHARSET_RANGE:
      /* <lower> <upper> */
      REGEX_TRACE("CHARSET_RANGE, %d <= %d <= %d\n", set[0], ch, set[1]);
      if (ch < set[0]) return !ok;
      else if (ch <= set[1]) return ok;
      set += 2;
      break;
    case LEXER_CHARSET_SMALL:
      /* <bitmap-index>; 256-bit bitmap for Latin-1 characters */
      REGEX_TRACE("CHARSET_SMALL, index=%d\n", set[0]);
      charset = lexer_charsets[*set++];
      if (ch < 256 && (charset[ch >> 3] & (1 << (ch & 7)))) return ok;
      break;
    case LEXER_CHARSET_BIG:
      /* <blockmap-index>; two-level bitmap for BMP characters */
      REGEX_TRACE("CHARSET_BIG, index=%d\n", set[0]);
      charset = lexer_charsets[lexer_blockmaps[*set++][ch >> 8]];
      if (ch < 65536 && charset[(ch & 255) >> 3] & (1 << (ch & 7))) return ok;
      break;
    case LEXER_CHARSET_FAILURE:
      /* nothing matched in charset */
      REGEX_TRACE("CHARSET_FAILURE\n");
      return !ok;
    default:
      REGEX_TRACE("**INTERNAL CHARSET ERROR**\n");
      return -1;
    }
  }
}

/* return values: 1 -> successful match, 0 -> no match, -1 -> error */
#ifdef Py_UNICODE_WIDE
#define GET_CHAR_AND_ADVANCE() ch = *ptr++;
#else
/* On narrow (UTF-16) builds, combine a surrogate pair into one code point. */
#define GET_CHAR_AND_ADVANCE() \
  if ((0xD800 <= ptr[0] && ptr[0] <= 0xDBFF) && \
      (0xDC00 <= ptr[1] && ptr[1] <= 0xDFFF)) { \
    ch = (((ptr[0] & 0x03FF) << 10) | (ptr[1] & 0x03FF)) + 0x00010000; \
    ptr += 2; \
  } else { \
    ch = *ptr++; \
  }
#endif

/* Execute the compiled pattern program `pattern` at the lexer's current
   position.  On success (return 1) lexer->position is advanced past the
   match; on failure (0) or internal error (-1) its final value is
   unspecified and the caller must reset it. */
static int lexer_match(parserobject *self, lexerobject *lexer, Py_UCS4 *pattern)
{
  Py_UNICODE *ptr = lexer->position;
  Py_UNICODE *end;
  Py_UCS4 ch;
  int i, count;

  REGEX_TRACE("LEXER_MATCH, position %d\n",
              lexer->position - PyUnicode_AS_UNICODE(lexer->text));
  while (1) {
    switch (*pattern++) {
    case LEXER_OP_FAILURE:
      /* immediate failure */
      REGEX_TRACE("OP_FAILURE\n");
      return 0;
    case LEXER_OP_SUCCESS:
      /* end of pattern */
      REGEX_TRACE("OP_SUCCESS\n");
      lexer->position = ptr;
      return 1;
    case LEXER_OP_BOL:
      /* beginning of line */
      REGEX_TRACE("OP_BOL\n");
      if (ptr == PyUnicode_AS_UNICODE(lexer->text) || ptr[-1] == '\n') break;
      return 0;
    case LEXER_OP_EOL:
      /* end of line */
      REGEX_TRACE("OP_EOL\n");
      if (ptr >= lexer->end || ptr[0] == '\n') break;
      return 0;
    case LEXER_OP_EOF:
      /* end of file */
      REGEX_TRACE("OP_EOF\n");
      if (ptr >= lexer->end) break;
      return 0;
    case LEXER_OP_ANY:
      /* match anything (except a newline) */
      REGEX_TRACE("OP_ANY\n");
      if (ptr >= lexer->end || ptr[0] == '\n') return 0;
      ptr++;
      break;
    case LEXER_OP_LITERAL:
      /* match literal character: <code point> */
      if (ptr >= lexer->end) return 0;
      GET_CHAR_AND_ADVANCE();
      REGEX_TRACE("OP_LITERAL, %d == %d\n", ch, pattern[0]);
      if (ch != pattern[0]) return 0;
      pattern++;
      break;
    case LEXER_OP_NOT_LITERAL:
      /* match anything that is not literal character: <code point> */
      if (ptr >= lexer->end) return 0;
      GET_CHAR_AND_ADVANCE();
      REGEX_TRACE("OP_NOT_LITERAL, %d != %d\n", ch, pattern[0]);
      if (ch == pattern[0]) return 0;
      pattern++;
      break;
    case LEXER_OP_CHARSET:
      /* match set member: <skip> <charset program> */
      if (ptr >= lexer->end) return 0;
      GET_CHAR_AND_ADVANCE();
      REGEX_TRACE("OP_CHARSET, skip %d\n", pattern[0]);
      i = lexer_charset(self, pattern + 1, ch, 1);
      if (i <= 0) return i;
      pattern += pattern[0];
      break;
    case LEXER_OP_NOT_CHARSET:
      /* match set non-member: <skip> <charset program> */
      if (ptr >= lexer->end) return 0;
      GET_CHAR_AND_ADVANCE();
      REGEX_TRACE("OP_NOT_CHARSET, skip %d\n", pattern[0]);
      i = lexer_charset(self, pattern + 1, ch, 0);
      if (i <= 0) return i;
      pattern += pattern[0];
      break;
    case LEXER_OP_ASSERT:
      /* lookahead assertion: <skip> <sub-pattern>; consumes no input */
      REGEX_TRACE("OP_ASSERT, skip %d\n", pattern[0]);
      lexer->position = ptr;
      i = lexer_match(self, lexer, pattern + 1);
      if (i <= 0) return i;
      pattern += pattern[0];
      break;
    case LEXER_OP_BRANCH:
      /* alternation: (<skip> <sub-pattern>)... <0>; longest match wins */
      end = NULL;
      count = 0;
      while (pattern[0]) {
        /* reset start position each time through */
        REGEX_TRACE("OP_BRANCH %d, skip %d\n", count++, pattern[0]);
        lexer->position = ptr;
        i = lexer_match(self, lexer, pattern + 1);
        if (i < 0) return i;
        else if (i && lexer->position > end)
          /* successful match which is longer than the current best
             matched */
          end = lexer->position;
        /* advance to the next pattern */
        pattern += pattern[0];
      }
      /* advance pattern past NULL */
      pattern++;
      /* advance to the best matching position if there was a match */
      if (end) {
        lexer->position = ptr = end;
        break;
      }
      return 0;
    case LEXER_OP_REPEAT:
      /* repetition: <0=skip> <1=min> item */
      {
        Py_UCS4 *item = pattern + 2;
        Py_UCS4 *next = pattern + pattern[0];
        int minimum = pattern[1];
        int backtracks;

        lexer->position = ptr;
        /* mandatory minimum matches */
        for (count = 0, i = 1; i == 1 && count < minimum; count++) {
          REGEX_TRACE("OP_REPEAT, min %d, now %d\n", minimum, count);
          i = lexer_match(self, lexer, item);
        }
        /* either internal error or failed minimum matches */
        if (i <= 0) return i;

        backtracks = lexer->backtracks;
        /* match as many items as possible, saving each position */
        for (; i == 1; count++) {
          REGEX_TRACE("OP_REPEAT, now %d\n", count);
          if (lexer_save_position(lexer) < 0) return -1;
          i = lexer_match(self, lexer, item);
        }
        if (i < 0) {
          /* internal error */
          lexer->backtracks = backtracks;
          return i;
        }

        /* backtracking assert of tail match until success */
        do {
          REGEX_TRACE("OP_REPEAT, now %d\n", count);
          /* update position to previous successful match */
          ptr = lexer_restore_position(lexer);
          if (ptr == NULL) return -1;
          i = lexer_match(self, lexer, next);
        } while (i == 0 && --count > minimum);
        /* discard remaining backtrack positions */
        lexer->backtracks = backtracks;
        if (i <= 0) {
          return i;
        }
        pattern = next;
      }
      break;
    case LEXER_OP_REPEAT_RANGE:
      /* repetition: <0=skip> <1=min> <2=max> item */
      {
        Py_UCS4 *item = pattern + 3;
        Py_UCS4 *next = pattern + pattern[0];
        int minimum = pattern[1];
        int maximum = pattern[2];
        int backtracks;

        lexer->position = ptr;
        /* mandatory minimum matches */
        for (count = 0, i = 1; i == 1 && count < minimum; count++) {
          REGEX_TRACE("OP_REPEAT_RANGE, min %d, now %d\n", minimum, count);
          i = lexer_match(self, lexer, item);
        }
        /* either internal error or failed minimum matches */
        if (i <= 0) return i;

        backtracks = lexer->backtracks;
        /* consume up to 'maximum' matches */
        for (; i == 1 && count < maximum; count++) {
          REGEX_TRACE("OP_REPEAT_RANGE, max %d, now %d\n", maximum, count);
          if (lexer_save_position(lexer) < 0) return -1;
          i = lexer_match(self, lexer, item);
        }
        if (i < 0) {
          /* internal error */
          lexer->backtracks = backtracks;
          return i;
        }
        /* maximum matches reached, update saved position */
        if (i == 1)
          ptr = lexer->position;

        /* backtracking assert of tail match until success */
        do {
          REGEX_TRACE("OP_REPEAT_RANGE, now %d\n", count);
          if (i == 0) {
            /* update position to last successful match */
            ptr = lexer_restore_position(lexer);
            if (ptr == NULL) return -1;
          }
          i = lexer_match(self, lexer, next);
        } while (i == 0 && --count > minimum);
        /* discard remaining backtrack positions */
        lexer->backtracks = backtracks;
        if (i <= 0) return i;
        pattern = next;
      }
      break;
    default:
      REGEX_TRACE("**INTERNAL MATCH ERROR**\n");
      return -1;
    }
  }
}

/* Scan the next token.  Tries every pattern of the current lexer state,
   keeps the longest match, then runs the generated action block which may
   set `yychar` and `*yylval`.  Returns the external token number, YYEOF at
   end of input, or -1 with an exception set. */
static int parser_yylex(parserobject *self, lexerobject *lexer,
                        PyObject **yylval)
{
  int yychar = YYEMPTY;
  int yylen;
  Py_UNICODE *yytext = lexer->position;

  while (yytext < lexer->end && yychar == YYEMPTY) {
    Py_UNICODE *best_end = NULL;
    int yyaccept = 0;
    int i;
    Py_UCS4 **patterns = (Py_UCS4 **)lexer_patterns[lexer->state];
    const int *actions = lexer_actions[lexer->state];
    REGEX_TRACE("Using patterns from lexer state %d\n", lexer->state);
    for (i = 0; patterns[i]; i++) {
      int matched; /* reset position each
time through */ lexer->position = yytext; REGEX_TRACE("--- pattern %d...\n", i); matched = lexer_match(self, lexer, patterns[i]); if (matched > 0 && lexer->position > best_end) { /* successful match which is longer than the current best matched */ best_end = lexer->position; yyaccept = i; } else if (matched < 0) { /* internal error */ REGEX_TRACE("--- pattern %d internal error\n", i); PyErr_SetString(PyExc_RuntimeError, "internal error in regular expression engine"); return -1; } REGEX_TRACE("--- pattern %d %s\n", i, matched ? "success" : "failed"); } if (best_end == NULL) { /* no matches */ lexer->position = yytext; lexer_error(lexer); return -1; } lexer->position = best_end; yylen = best_end - yytext; /* get the action block for this match */ switch (actions[yyaccept]) { $$lexer } } if (yychar == YYEMPTY) { /* Reached end of input */ yychar = YYEOF; } return yychar; } /** Type Object *******************************************************/ static int parser_traverse(parserobject *self, visitproc visit, void *arg) { int rv; if (self->dict) { rv = visit(self->dict, arg); if (rv != 0) return rv; } return 0; } static int parser_clear(parserobject *self) { PyObject *tmp; if (self->dict) { tmp = self->dict; self->dict = NULL; Py_DECREF(tmp); } return 0; } static void parser_dealloc(parserobject *self) { parser_clear(self); self->ob_type->tp_free((PyObject *) self); } static int parser_init(parserobject *self, PyObject *args, PyObject *kwds) { PyObject *debug=NULL; static char *kwlist[] = { "debug", NULL }; if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:" PARSER_NAME, kwlist, &debug)) return -1; if (debug) { self->verbose = PyObject_IsTrue(debug); } return 0; } static PyObject *parser_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { parserobject *self; self = (parserobject *) type->tp_alloc(type, 0); if (self != NULL) { self->dict = PyDict_New(); if (self->dict == NULL) { Py_DECREF(self); return NULL; } self->verbose = 0; } return (PyObject *) self; } static 
PyMethodDef parser_methods[] = {
  /* the only public method: parse(string) -> parse tree */
  { "parse", (PyCFunction) parser_parse, METH_VARARGS, parser_parse_doc },
  { NULL, NULL }
};

static PyMemberDef parser_members[] = {
  /* expose the trace flag as a writable int attribute */
  { "debug", T_INT, offsetof(parserobject, verbose) },
  { NULL }
};

static char parser_doc[] = PARSER_NAME "\
([debug]) -> parser\n\
Create a new parser object.\n\
\n\
The optional debug argument, when true, enables the builtin trace facility.\n\
The trace facility uses stderr to display each step taken by the parser.";

static PyTypeObject Parser_Type = {
  /* PyObject_HEAD     */ PyObject_HEAD_INIT(NULL)
  /* ob_size           */ 0,
  /* tp_name           */ PROJECT_NAME "." PARSER_NAME,
  /* tp_basicsize      */ sizeof(parserobject),
  /* tp_itemsize       */ 0,
  /* tp_dealloc        */ (destructor) parser_dealloc,
  /* tp_print          */ (printfunc) 0,
  /* tp_getattr        */ (getattrfunc) 0,
  /* tp_setattr        */ (setattrfunc) 0,
  /* tp_compare        */ (cmpfunc) 0,
  /* tp_repr           */ (reprfunc) 0,
  /* tp_as_number      */ (PyNumberMethods *) 0,
  /* tp_as_sequence    */ (PySequenceMethods *) 0,
  /* tp_as_mapping     */ (PyMappingMethods *) 0,
  /* tp_hash           */ (hashfunc) 0,
  /* tp_call           */ (ternaryfunc) 0,
  /* tp_str            */ (reprfunc) 0,
  /* tp_getattro       */ (getattrofunc) 0,
  /* tp_setattro       */ (setattrofunc) 0,
  /* tp_as_buffer      */ (PyBufferProcs *) 0,
  /* tp_flags          */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
  /* tp_doc            */ (char *) parser_doc,
  /* tp_traverse       */ (traverseproc) parser_traverse,
  /* tp_clear          */ (inquiry) parser_clear,
  /* tp_richcompare    */ (richcmpfunc) 0,
  /* tp_weaklistoffset */ 0,
  /* tp_iter           */ (getiterfunc) 0,
  /* tp_iternext       */ (iternextfunc) 0,
  /* tp_methods        */ (PyMethodDef *) parser_methods,
  /* tp_members        */ (PyMemberDef *) parser_members,
  /* tp_getset         */ (PyGetSetDef *) 0,
  /* tp_base           */ (PyTypeObject *) 0,
  /* tp_dict           */ (PyObject *) 0,
  /* tp_descr_get      */ (descrgetfunc) 0,
  /* tp_descr_set      */ (descrsetfunc) 0,
  /* tp_dictoffset     */ offsetof(parserobject, dict),
  /* tp_init           */ (initproc) parser_init,
  /* tp_alloc          */ (allocfunc) 0,
  /* tp_new            */ (newfunc) parser_new,
  /* tp_free           */ 0,
};

/* Helper functions */

/*
caller is responsible for releasing the memory */ static char *unicode_escape(Py_UNICODE *s, int len) { static const char *hexdigit = "0123456789ABCDEF"; char *repr, *p; int i, size; /* Do one pass to get the repr'ed size */ size = 1; /* zero terminator */ for (i = 0; i < len; i++) { #ifdef Py_UNICODE_WIDE if (s[i] >= 65536) size += 10; /* \UHHHHHHHH */ else #endif if (s[i] >= 256) size += 6; /* \uHHHH */ else if (s[i] == 9 || s[i] == 10 || s[i] == 13) size += 2; /* \t \n \r */ else if (s[i] < 32 || s[i] >= 128) size += 4; /* \xHH */ else size++; /* printable US-ASCII */ } repr = p = PyMem_New(char, size + 1); if (repr == NULL) return NULL; while (len-- > 0) { Py_UNICODE ch = *s++; #ifdef Py_UNICODE_WIDE /* Map 32-bit characters to '\Uxxxxxxxx' */ if (ch >= 65536) { *p++ = '\\'; *p++ = 'U'; *p++ = hexdigit[(ch >> 28) & 0xf]; *p++ = hexdigit[(ch >> 24) & 0xf]; *p++ = hexdigit[(ch >> 20) & 0xf]; *p++ = hexdigit[(ch >> 16) & 0xf]; *p++ = hexdigit[(ch >> 12) & 0xf]; *p++ = hexdigit[(ch >> 8) & 0xf]; *p++ = hexdigit[(ch >> 4) & 0xf]; *p++ = hexdigit[ch & 15]; } /* Map 16-bit characters to '\uxxxx' */ else #endif if (ch >= 256) { *p++ = '\\'; *p++ = 'u'; *p++ = hexdigit[(ch >> 12) & 0xf]; *p++ = hexdigit[(ch >> 8) & 0xf]; *p++ = hexdigit[(ch >> 4) & 0xf]; *p++ = hexdigit[ch & 15]; } /* Map special whitespace to '\t', \n', '\r' */ else if (ch == 9) { *p++ = '\\'; *p++ = 't'; } else if (ch == 10) { *p++ = '\\'; *p++ = 'n'; } else if (ch == 13) { *p++ = '\\'; *p++ = 'r'; } /* Map non-printable US ASCII to '\xhh' */ else if (ch < 32 || ch >= 128) { *p++ = '\\'; *p++ = 'x'; *p++ = hexdigit[(ch >> 4) & 0xf]; *p++ = hexdigit[ch & 15]; } /* Copy everything else as-is */ else *p++ = (char) ch; } *p = '\0'; return repr; } static void calculate_position(lexerobject *lexer, int *line, int *column) { /* Determine line and column numbers */ Py_UNICODE *p; *line = 1; *column = 1; for (p = PyUnicode_AS_UNICODE(lexer->text); p < lexer->end; p++) { if ((char)*p == '\n') { *line += 1; 
*column = 1; } else { *column += 1; } } } static const char error_format_str[] = "parse error at line %d, column %d: matched '%s'"; static const char error_format_eof_str[] = "parse error at line %d, column %d: reached end-of-input"; static PyObject *report_error(int state, PyObject* lval, lexerobject *lexer) { int line, column; int ruleno = action_idx[state]; char *matched = NULL; if (lval) { matched = unicode_escape(PyUnicode_AS_UNICODE(lval), PyUnicode_GET_SIZE(lval)); if (matched == NULL) return NULL; } calculate_position(lexer, &line, &column); Py_DECREF(lexer->text); if (ruleno > YYFLAG && ruleno < YYLAST) { /* There are expected tokens */ int x, count; int size = 60; /* Initial format string */ char *msg; /* Start X at -yyn if nec to avoid negative indexes in yycheck. */ for (x = (ruleno < 0 ? -ruleno : 0); x < (sizeof(token_names) / sizeof(char *)); x++) { if (yycheck[x + ruleno] == x) { size += strlen(token_names[x]) + 15; } } msg = PyMem_New(char, size); if (msg == NULL) { PyMem_Del(matched); return NULL; } if (lval) { strcpy(msg, error_format_str); } else { strcpy(msg, error_format_eof_str); } count = 0; for (x = (ruleno < 0 ? -ruleno : 0); x < (sizeof(token_names) / sizeof(char *)); x++) { if (yycheck[x + ruleno] == x) { strcat(msg, count == 0 ? 
", expecting '" : " or '"); strcat(msg, token_names[x]); strcat(msg, "'"); count++; } } if (matched) { PyErr_Format(PyExc_SyntaxError, msg, line, column, matched); } else { PyErr_Format(PyExc_SyntaxError, msg, line, column); } PyMem_Del(msg); } else { if (matched) { PyErr_Format(PyExc_SyntaxError, error_format_str, line, column, matched); } else { PyErr_Format(PyExc_SyntaxError, error_format_eof_str, line, column); } } if (matched) { PyMem_Del(matched); } return NULL; } static const char lexer_error_str[] = "lexical error at line %d, column %d: no action found for '%s'"; static void lexer_error(lexerobject *lexer) { int line, column; char *repr = unicode_escape(lexer->position, (lexer->end - lexer->position)); if (repr == NULL) return; calculate_position(lexer, &line, &column); PyErr_Format(PyExc_SyntaxError, lexer_error_str, line, column, repr); PyMem_Del(repr); return; } static void print_reduce(int ruleno) { int count; const int *token; PySys_WriteStderr("Reducing via rule %d (%s), ", ruleno, rule_info[ruleno]); /* print the symbols being reduced and their result. 
*/ count = ruleno; token = rhs_tokens; while (--count) while (*++token); while (*++token) { PySys_WriteStderr("%s ", token_names[*token]); } PySys_WriteStderr("-> %s\n", token_names[derives[ruleno]]); } static void print_state_stack(int *stack, int *end) { int *curr = stack; PySys_WriteStderr("state stack now"); while (curr <= end) { PySys_WriteStderr(" %d", *curr++); } PySys_WriteStderr("\n"); } static PyObject *import_from(char *modulename, char *fromname) { PyObject *fromlist, *name, *module; fromlist = PyTuple_New(1); if (fromlist == NULL) return NULL; name = PyString_FromString(fromname); if (name == NULL) { Py_DECREF(fromlist); return NULL; } Py_INCREF(name); PyTuple_SET_ITEM(fromlist, 0, name); module = PyImport_ImportModuleEx(modulename, NULL, NULL, fromlist); Py_DECREF(fromlist); if (module == NULL) { Py_DECREF(name); return NULL; } fromlist = PyObject_GetAttr(module, name); Py_DECREF(module); Py_DECREF(name); return fromlist; } static PyMethodDef module_methods[] = { { NULL } };