*** Lib/regex_syntax.py Tue Dec 31 01:02:51 1991 --- Lib/regex_syntax.py Sun Sep 15 08:36:41 1996 *************** *** 32,39 **** --- 32,50 ---- # *, +, ? - only special when not after the beginning, (, or | RE_CONTEXT_INDEP_OPS = 32 + # Treat \w as [A-Za-z0-9_] instead of just [A-Za-z0-9], \W as [^A-Za-z0-9_], + # \d as [0-9], \D as [^0-9], \s as [ \t\r\n\f], \S as [^ \t\r\n\f], like Perl. + # Additionally, treat \h as a hex digit: [0-9a-fA-F] and \H as [^0-9a-fA-F], + # and \l as a letter of the alphabet: [A-Za-z] and \L as [^A-Za-z]. + RE_EXTRA_CLASSES = 256 + + # Allow the minimal quantifying operators ??, +?, and *? which will match + # the shortest possible piece of the search string instead of the longest. + RE_MINIMAL_OPS = 512 + # Now define combinations of bits for the standard possibilities. RE_SYNTAX_AWK = (RE_NO_BK_PARENS | RE_NO_BK_VBAR | RE_CONTEXT_INDEP_OPS) + RE_SYNTAX_PERLISH = (RE_SYNTAX_AWK | RE_EXTRA_CLASSES | RE_MINIMAL_OPS) RE_SYNTAX_EGREP = (RE_SYNTAX_AWK | RE_NEWLINE_OR) RE_SYNTAX_GREP = (RE_BK_PLUS_QM | RE_NEWLINE_OR) RE_SYNTAX_EMACS = 0 *** Modules/regexpr.c Fri Sep 13 02:57:39 1996 --- Modules/regexpr.c Sun Sep 15 08:31:49 1996 *************** *** 106,111 **** --- 106,119 ---- Rwordend, /* end of word */ Rwordbound, /* word bound */ Rnotwordbound, /* not word bound */ + Rdigitchar, /* P - digit character */ + Rnotdigitchar, /* P - not digit character */ + Rhexdigitchar, /* P - hexadecimal digit character */ + Rnothexdigitchar, /* P - not hexadecimal digit character */ + Rwhitespacechar, /* P - whitespace character */ + Rnotwhitespacechar, /* P - not whitespace character */ + Rletterchar, /* P - letter of alphabet */ + Rnotletterchar, /* P - not letter of alphabet */ #ifdef emacs Remacs_at_dot, /* emacs: at dot */ Remacs_syntaxspec, /* syntaxspec */ *************** *** 122,127 **** --- 130,136 ---- static unsigned char regexp_precedences[Rnum_ops]; static int regexp_context_indep_ops; static int regexp_ansi_sequences; + static int 
regexp_minimal_ops; /* P - added this flag */ #define NUM_LEVELS 5 /* number of precedence levels in use */ #define MAX_NESTING 100 /* max nesting level of operators */ *************** *** 143,148 **** --- 152,161 ---- #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)] #define Sword 1 + #define Sdigit 2 + #define Shexdigit 4 + #define Swhitespace 8 + #define Sletter 16 #ifdef SYNTAX_TABLE char *re_syntax_table; *************** *** 164,177 **** { syntax_table_inited = 1; memset(re_syntax_table, 0, 256); ! for (a = 'a'; a <= 'z'; a++) ! re_syntax_table[a] = Sword; for (a = 'A'; a <= 'Z'; a++) ! re_syntax_table[a] = Sword; ! for (a = '0'; a <= '9'; a++) ! re_syntax_table[a] = Sword; } #endif /* !emacs && !SYNTAX_TABLE */ re_compile_initialized = 1; for (a = 0; a < 256; a++) { --- 177,210 ---- { syntax_table_inited = 1; memset(re_syntax_table, 0, 256); ! for (a = 'a'; a <= 'z'; a++) /* P - added Sletter */ ! re_syntax_table[a] = Sword | Sletter; for (a = 'A'; a <= 'Z'; a++) ! re_syntax_table[a] = Sword | Sletter; ! ! for (a = '0'; a <= '9'; a++) /* P - added Sdigit and Shexdigit */ ! re_syntax_table[a] = Sword | Sdigit | Shexdigit; ! for (a = 'a'; a <= 'f'; a++) ! re_syntax_table[a] = Sword | Sletter | Shexdigit; ! for (a = 'A'; a <= 'F'; a++) ! re_syntax_table[a] = Sword | Sletter | Shexdigit; ! ! re_syntax_table[' '] = Swhitespace; /* P - added these entries */ ! re_syntax_table['\t'] = Swhitespace; ! re_syntax_table['\r'] = Swhitespace; ! re_syntax_table['\n'] = Swhitespace; ! re_syntax_table['\f'] = Swhitespace; } + + /* P - The following entry is outside syntax_table_inited because we might */ + /* have to update it whenever the regex_syntax changes. 
*/ + + if (regexp_syntax & RE_EXTRA_CLASSES) /* P - added this clause */ + re_syntax_table['_'] = Sword; + else + re_syntax_table['_'] = 0; #endif /* !emacs && !SYNTAX_TABLE */ + re_compile_initialized = 1; for (a = 0; a < 256; a++) { *************** *** 228,233 **** --- 261,279 ---- regexp_quoted_ops['`'] = Rbegbuf; regexp_quoted_ops['\''] = Rendbuf; } + if (regexp_syntax & RE_EXTRA_CLASSES) /* P - added this clause */ + { + regexp_quoted_ops['s'] = Rwhitespacechar; + regexp_quoted_ops['S'] = Rnotwhitespacechar; + regexp_quoted_ops['w'] = Rwordchar; + regexp_quoted_ops['W'] = Rnotwordchar; + regexp_quoted_ops['d'] = Rdigitchar; + regexp_quoted_ops['D'] = Rnotdigitchar; + regexp_quoted_ops['h'] = Rhexdigitchar; /* not part of Perl, */ + regexp_quoted_ops['H'] = Rnothexdigitchar; /* but convenient */ + regexp_quoted_ops['l'] = Rletterchar; /* not part of Perl, */ + regexp_quoted_ops['L'] = Rnotletterchar; /* but convenient */ + } if (regexp_syntax & RE_ANSI_HEX) regexp_quoted_ops['v'] = Rextended_memory; for (a = 0; a < Rnum_ops; a++) *************** *** 248,253 **** --- 294,302 ---- regexp_precedences[Rend] = 0; regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0; regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0; + + /* P - added this flag */ + regexp_minimal_ops = (regexp_syntax & RE_MINIMAL_OPS) != 0; } int re_set_syntax(syntax) *************** *** 524,529 **** --- 573,593 ---- goto op_error; else goto normal_char; + + if (regexp_minimal_ops && pos < size && regex[pos] == '?') + { /* P - added this clause for minimal ? */ + pos++; + if (CURRENT_LEVEL_START == pattern_offset) + break; /* P - ignore empty patterns for ?? */ + + ALLOC(6); + INSERT_JUMP(CURRENT_LEVEL_START, Cjump, + pattern_offset + 3); + INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, + CURRENT_LEVEL_START + 6); + break; + } + if (CURRENT_LEVEL_START == pattern_offset) break; /* ignore empty patterns for ? 
*/ ALLOC(3); *************** *** 537,542 **** --- 601,631 ---- goto op_error; else goto normal_char; + + if (regexp_minimal_ops && pos < size && regex[pos] == '?') + { /* P - added this clause for minimal * and + */ + pos++; + if (CURRENT_LEVEL_START == pattern_offset) + break; /* P - ignore empty patterns for *? and +? */ + + if (op == Rstar) + { + ALLOC(6); + INSERT_JUMP(CURRENT_LEVEL_START, Cjump, + pattern_offset + 3); + INSERT_JUMP(pattern_offset, Cfailure_jump, + CURRENT_LEVEL_START + 3); + } + else /* op == Rplus */ + { + ALLOC(3); + INSERT_JUMP(pattern_offset, Cfailure_jump, + CURRENT_LEVEL_START); + } + + break; + } + if (CURRENT_LEVEL_START == pattern_offset) break; /* ignore empty patterns for + and * */ ALLOC(9); *************** *** 686,691 **** --- 775,812 ---- opcode = Cnotsyntaxspec; ch = Sword; goto store_opcode_and_arg; + case Rdigitchar: /* P - added this case */ + opcode = Csyntaxspec; + ch = Sdigit; + goto store_opcode_and_arg; + case Rnotdigitchar: /* P - added this case */ + opcode = Cnotsyntaxspec; + ch = Sdigit; + goto store_opcode_and_arg; + case Rhexdigitchar: /* P - added this case */ + opcode = Csyntaxspec; + ch = Shexdigit; + goto store_opcode_and_arg; + case Rnothexdigitchar: /* P - added this case */ + opcode = Cnotsyntaxspec; + ch = Shexdigit; + goto store_opcode_and_arg; + case Rwhitespacechar: /* P - added this case */ + opcode = Csyntaxspec; + ch = Swhitespace; + goto store_opcode_and_arg; + case Rnotwhitespacechar: /* P - added this case */ + opcode = Cnotsyntaxspec; + ch = Swhitespace; + goto store_opcode_and_arg; + case Rletterchar: /* P - added this case */ + opcode = Csyntaxspec; + ch = Sletter; + goto store_opcode_and_arg; + case Rnotletterchar: /* P - added this case */ + opcode = Cnotsyntaxspec; + ch = Sletter; + goto store_opcode_and_arg; case Rwordbeg: opcode = Cwordbeg; goto store_opcode; *************** *** 803,815 **** case Csyntaxspec: syntaxcode = code[pos++]; for (a = 0; a < 256; a++) ! 
if (SYNTAX(a) == syntaxcode) fastmap[a] = 1; return; case Cnotsyntaxspec: syntaxcode = code[pos++]; for (a = 0; a < 256; a++) ! if (SYNTAX(a) != syntaxcode) fastmap[a] = 1; return; case Ceol: --- 924,936 ---- case Csyntaxspec: syntaxcode = code[pos++]; for (a = 0; a < 256; a++) ! if (SYNTAX(a) & syntaxcode) /* P - changed == to & */ fastmap[a] = 1; return; case Cnotsyntaxspec: syntaxcode = code[pos++]; for (a = 0; a < 256; a++) ! if (!(SYNTAX(a) & syntaxcode)) /* P - changed == to & */ fastmap[a] = 1; return; case Ceol: *************** *** 1337,1350 **** break; case Csyntaxspec: NEXTCHAR(ch); ! if (SYNTAX(ch) != (unsigned char)*code++) ! goto fail; ! break; ! case Cnotsyntaxspec: ! NEXTCHAR(ch); ! if (SYNTAX(ch) != (unsigned char)*code++) break; goto fail; #ifdef emacs case Cemacs_at_dot: if (PTR_CHAR_POS((unsigned char *)text) + 1 != point) --- 1458,1471 ---- break; case Csyntaxspec: NEXTCHAR(ch); ! if (SYNTAX(ch) & ((unsigned char)*code++)) /* P - changed == to & */ break; goto fail; + case Cnotsyntaxspec: + NEXTCHAR(ch); + if (SYNTAX(ch) & ((unsigned char)*code++)) /* P - changed == to & */ + goto fail; + break; #ifdef emacs case Cemacs_at_dot: if (PTR_CHAR_POS((unsigned char *)text) + 1 != point) *** Modules/regexpr.h Sat May 25 05:51:22 1996 --- Modules/regexpr.h Sun Sep 15 08:34:04 1996 *************** *** 55,63 **** --- 55,66 ---- #define RE_CONTEXT_INDEP_OPS 32 /* ^$?*+ are special in all contexts */ #define RE_ANSI_HEX 64 /* ansi sequences (\n etc) and \xhh */ #define RE_NO_GNU_EXTENSIONS 128 /* no gnu extensions */ + #define RE_EXTRA_CLASSES 256 /* \w\W\d\D\s\S\h\H\l\L char classes */ + #define RE_MINIMAL_OPS 512 /* allow minimal operators ??, +?, *? 
*/ /* definitions for some common regexp styles */ #define RE_SYNTAX_AWK (RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_CONTEXT_INDEP_OPS) + #define RE_SYNTAX_PERLISH (RE_SYNTAX_AWK|RE_EXTRA_CLASSES|RE_MINIMAL_OPS) #define RE_SYNTAX_EGREP (RE_SYNTAX_AWK|RE_NEWLINE_OR) #define RE_SYNTAX_GREP (RE_BK_PLUS_QM|RE_NEWLINE_OR) #define RE_SYNTAX_EMACS 0