12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585 |
- /*++
- Copyright (c) 2013 Minoca Corp.
- This file is licensed under the terms of the GNU General Public License
- version 3. Alternative licensing terms are available. Contact
- info@minocacorp.com for details. See the LICENSE file at the root of this
- project for complete licensing information.
- Module Name:
- regexcmp.c
- Abstract:
- This module implements support for compiling regular expressions.
- Author:
- Evan Green 8-Jul-2013
- Environment:
- User Mode C Library
- --*/
- //
- // ------------------------------------------------------------------- Includes
- //
- #define LIBC_API __DLLEXPORT
- #include <minoca/lib/types.h>
- #include <assert.h>
- #include <ctype.h>
- #include <regex.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include "regexp.h"
- //
- // ---------------------------------------------------------------- Definitions
- //
- //
- // Define the maximum size of a string returned by regerror.
- //
- #define REGULAR_EXPRESSION_ERROR_STRING_MAX_SIZE 512
- //
- // Define regular expression tokens.
- //
- #define TOKEN_ESCAPED_OPEN_PARENTHESES 512
- #define TOKEN_ESCAPED_CLOSE_PARENTHESES 513
- #define TOKEN_ESCAPED_OPEN_BRACE 514
- #define TOKEN_ESCAPED_CLOSE_BRACE 515
- #define TOKEN_QUOTED_CHARACTER 516
- #define TOKEN_BACK_REFERENCE 517
- //
- // Define bracket expression tokens.
- //
- #define TOKEN_OPEN_EQUAL 550
- #define TOKEN_EQUAL_CLOSE 551
- #define TOKEN_OPEN_DOT 552
- #define TOKEN_DOT_CLOSE 553
- #define TOKEN_OPEN_COLON 554
- #define TOKEN_COLON_CLOSE 555
- //
- // Define the initial buffer size for regular expression strings.
- //
- #define REGULAR_EXPRESSION_INITIAL_STRING_SIZE 16
- //
- // ------------------------------------------------------ Data Type Definitions
- //
- /*++
- Structure Description:
- This structure defines the lexer state for the regular expression compiler.
- Members:
- Input - Stores the input string.
- InputSize - Stores the size of the input string in bytes including the
- null terminator.
- NextInput - Stores the offset of the next character to grab.
- Token - Stores the current token.
- ActiveSubexpressionCount - Stores the number of subexpressions currently
- being parsed (the number of close parentheses to treat as special
- characters).
- --*/
- typedef struct _REGULAR_EXPRESSION_LEXER {
- PSTR Input;
- ULONG InputSize;
- ULONG NextInput;
- ULONG Token;
- ULONG ActiveSubexpressionCount;
- } REGULAR_EXPRESSION_LEXER, *PREGULAR_EXPRESSION_LEXER;
- //
- // ----------------------------------------------- Internal Function Prototypes
- //
- REGULAR_EXPRESSION_STATUS
- ClpCompileRegularExpression (
- PSTR Pattern,
- ULONG Flags,
- PREGULAR_EXPRESSION *Expression
- );
- VOID
- ClpDestroyRegularExpression (
- PREGULAR_EXPRESSION Expression
- );
- REGULAR_EXPRESSION_STATUS
- ClpParseCompleteBasicRegularExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression
- );
- REGULAR_EXPRESSION_STATUS
- ClpParseBasicRegularExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY ParentEntry
- );
- REGULAR_EXPRESSION_STATUS
- ClpParseSimpleExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY *Entry
- );
- REGULAR_EXPRESSION_STATUS
- ClpParseExtendedRegularExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY ParentEntry
- );
- REGULAR_EXPRESSION_STATUS
- ClpParseExtendedRegularExpressionBranch (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY ParentEntry
- );
- REGULAR_EXPRESSION_STATUS
- ClpParseExtendedExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY *Entry
- );
- REGULAR_EXPRESSION_STATUS
- ClpParseBracketExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY Entry
- );
- REGULAR_EXPRESSION_STATUS
- ClpParseRegularExpressionDuplicationSymbol (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY Entry
- );
- REGULAR_EXPRESSION_STATUS
- ClpParseRegularExpressionDuplicationCount (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY Entry
- );
- REGULAR_EXPRESSION_STATUS
- ClpGetRegularExpressionToken (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression
- );
- REGULAR_EXPRESSION_STATUS
- ClpGetBracketExpressionToken (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression
- );
- PREGULAR_EXPRESSION_ENTRY
- ClpCreateRegularExpressionEntry (
- REGEX_ENTRY_TYPE Type
- );
- VOID
- ClpDestroyRegularExpressionEntry (
- PREGULAR_EXPRESSION_ENTRY Entry
- );
- BOOL
- ClpAppendRegularExpressionString (
- PREGULAR_EXPRESSION_STRING String,
- PSTR Data,
- ULONG Size
- );
- //
- // -------------------------------------------------------------------- Globals
- //
- //
- // ------------------------------------------------------------------ Functions
- //
- LIBC_API
- int
- regcomp (
- regex_t *RegularExpression,
- const char *Pattern,
- int Flags
- )
- /*++
- Routine Description:
- This routine compiles a regular expression.
- Arguments:
- RegularExpression - Supplies a pointer to the regular expression structure
- where the compiled form will reside on success.
- Pattern - Supplies a pointer to the pattern input string.
- Flags - Supplies a bitfield of flags governing the behavior of the regular
- expression. See some REG_* definitions.
- Return Value:
- 0 on success.
- Returns a REG_* status code on failure.
- --*/
- {
- PREGULAR_EXPRESSION CompiledExpression;
- REGULAR_EXPRESSION_STATUS Status;
- RegularExpression->re_nsub = 0;
- RegularExpression->re_data = NULL;
- Status = ClpCompileRegularExpression((PSTR)Pattern,
- Flags,
- &CompiledExpression);
- if (Status != RegexStatusSuccess) {
- return Status;
- }
- RegularExpression->re_nsub = CompiledExpression->SubexpressionCount;
- RegularExpression->re_data = CompiledExpression;
- return 0;
- }
- LIBC_API
- void
- regfree (
- regex_t *RegularExpression
- )
- /*++
- Routine Description:
- This routine destroys and frees all resources associated with a compiled
- regular expression.
- Arguments:
- RegularExpression - Supplies a pointer to the regular expression structure
- to destroy. The caller owns the structure itself, this routine just
- guts all the innards.
- Return Value:
- None.
- --*/
- {
- if (RegularExpression == NULL) {
- return;
- }
- ClpDestroyRegularExpression(RegularExpression->re_data);
- RegularExpression->re_data = NULL;
- RegularExpression->re_nsub = 0;
- return;
- }
- LIBC_API
- size_t
- regerror (
- int ErrorCode,
- const regex_t *Expression,
- char *Buffer,
- size_t BufferSize
- )
- /*++
- Routine Description:
- This routine returns error information about what went wrong trying to
- compile the regular expression.
- Arguments:
- ErrorCode - Supplies the error code returned from a regular expression
- token.
- Expression - Supplies an optional pointer to the expression.
- Buffer - Supplies a pointer to a buffer where the error string will be
- returned, always null terminated.
- BufferSize - Supplies the size of the buffer in bytes.
- Return Value:
- Returns the number of bytes needed to hold the entire error string,
- including the null terminator. If the return value is greater than the
- supplied size, then the buffer will be truncated and null terminated.
- --*/
- {
- int BytesPrinted;
- PSTR ErrorString;
- switch (ErrorCode) {
- case 0:
- ErrorString = "No error";
- break;
- case REG_NOMATCH:
- ErrorString = "No match";
- break;
- case REG_BADPAT:
- ErrorString = "Bad pattern";
- break;
- case REG_ECOLLATE:
- ErrorString = "Invalid collating element";
- break;
- case REG_ECTYPE:
- ErrorString = "Invalid character class";
- break;
- case REG_EESCAPE:
- ErrorString = "Dangling escape character";
- break;
- case REG_ESUBREG:
- ErrorString = "Invalid subexpression";
- break;
- case REG_EBRACK:
- ErrorString = "Square bracket imbalance";
- break;
- case REG_EPAREN:
- ErrorString = "Parentheses imbalance";
- break;
- case REG_BADBR:
- ErrorString = "Invalid curly braces";
- break;
- case REG_ERANGE:
- ErrorString = "Invalid range expression";
- break;
- case REG_ESPACE:
- ErrorString = "Out of memory";
- break;
- case REG_BADRPT:
- ErrorString = "Bad repeat expression";
- break;
- default:
- ErrorString = "Unknown error";
- break;
- }
- BytesPrinted = snprintf(Buffer, BufferSize, "%s", ErrorString);
- if (BytesPrinted < 0) {
- BytesPrinted = REGULAR_EXPRESSION_ERROR_STRING_MAX_SIZE;
- if ((Buffer != NULL) && (BufferSize != 0)) {
- *Buffer = '\0';
- }
- } else if (BytesPrinted == BufferSize) {
- Buffer[BufferSize - 1] = '\0';
- BytesPrinted = REGULAR_EXPRESSION_ERROR_STRING_MAX_SIZE;
- } else {
- //
- // Include the null terminator.
- //
- BytesPrinted += 1;
- }
- return BytesPrinted;
- }
- //
- // --------------------------------------------------------- Internal Functions
- //
- REGULAR_EXPRESSION_STATUS
- ClpCompileRegularExpression (
- PSTR Pattern,
- ULONG Flags,
- PREGULAR_EXPRESSION *Expression
- )
- /*++
- Routine Description:
- This routine compiles a regular expression.
- Arguments:
- Pattern - Supplies a pointer to the pattern input string.
- Flags - Supplies a bitfield of flags governing the behavior of the regular
- expression. See some REG_* definitions.
- Expression - Supplies a pointer where a pointer to the regular expression
- will be returned on success. The caller is responsible for calling the
- appropriate destroy routine for this structure.
- Return Value:
- Regular expression status code.
- --*/
- {
- REGULAR_EXPRESSION_LEXER Lexer;
- PREGULAR_EXPRESSION Result;
- REGULAR_EXPRESSION_STATUS Status;
- //
- // Allocate and initialize the basic structures.
- //
- Result = malloc(sizeof(REGULAR_EXPRESSION));
- if (Result == NULL) {
- Status = RegexStatusNoMemory;
- goto CompileRegularExpressionEnd;
- }
- memset(Result, 0, sizeof(REGULAR_EXPRESSION));
- Result->BaseEntry.Type = RegexEntrySubexpression;
- INITIALIZE_LIST_HEAD(&(Result->BaseEntry.ChildList));
- Result->BaseEntry.DuplicateMin = 1;
- Result->BaseEntry.DuplicateMax = 1;
- Result->Flags = Flags;
- memset(&Lexer, 0, sizeof(REGULAR_EXPRESSION_LEXER));
- Lexer.Input = Pattern;
- Lexer.InputSize = strlen(Pattern) + 1;
- //
- // Prime the lexer.
- //
- Status = ClpGetRegularExpressionToken(&Lexer, Result);
- if (Status != RegexStatusSuccess) {
- goto CompileRegularExpressionEnd;
- }
- if ((Flags & REG_EXTENDED) != 0) {
- Status = ClpParseExtendedRegularExpression(&Lexer,
- Result,
- &(Result->BaseEntry));
- } else {
- Status = ClpParseCompleteBasicRegularExpression(&Lexer, Result);
- }
- if (Status != RegexStatusSuccess) {
- goto CompileRegularExpressionEnd;
- }
- //
- // Fail if this isn't the end.
- //
- if (Lexer.Token != '\0') {
- Status = RegexStatusBadPattern;
- goto CompileRegularExpressionEnd;
- }
- CompileRegularExpressionEnd:
- if (Status != RegexStatusSuccess) {
- if (Result != NULL) {
- ClpDestroyRegularExpression(Result);
- Result = NULL;
- }
- }
- *Expression = Result;
- return Status;
- }
- VOID
- ClpDestroyRegularExpression (
- PREGULAR_EXPRESSION Expression
- )
- /*++
- Routine Description:
- This routine destroys a regular expression structure.
- Arguments:
- Expression - Supplies a pointer to the regular expression structure to
- destroy.
- Return Value:
- None.
- --*/
- {
- PREGULAR_EXPRESSION_ENTRY Entry;
- if (Expression == NULL) {
- return;
- }
- while (LIST_EMPTY(&(Expression->BaseEntry.ChildList)) == FALSE) {
- Entry = LIST_VALUE(Expression->BaseEntry.ChildList.Next,
- REGULAR_EXPRESSION_ENTRY,
- ListEntry);
- LIST_REMOVE(&(Entry->ListEntry));
- ClpDestroyRegularExpressionEntry(Entry);
- }
- free(Expression);
- return;
- }
- REGULAR_EXPRESSION_STATUS
- ClpParseCompleteBasicRegularExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression
- )
- /*++
- Routine Description:
- This routine compiles a basic regular expression.
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer.
- Expression - Supplies the expression to compile on.
- Return Value:
- Regular expression status code.
- --*/
- {
- ULONG EntryFlags;
- REGULAR_EXPRESSION_STATUS Status;
- EntryFlags = 0;
- //
- // Parse an optional left anchor (^).
- //
- if (Lexer->Token == '^') {
- EntryFlags |= REGULAR_EXPRESSION_ANCHORED_LEFT;
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseBasicRegularExpressionEnd;
- }
- }
- //
- // Parse an expression.
- //
- Status = ClpParseBasicRegularExpression(Lexer,
- Expression,
- &(Expression->BaseEntry));
- if (Status != RegexStatusSuccess) {
- goto ParseBasicRegularExpressionEnd;
- }
- //
- // Parse an optional right anchor.
- //
- if (Lexer->Token == '$') {
- EntryFlags |= REGULAR_EXPRESSION_ANCHORED_RIGHT;
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseBasicRegularExpressionEnd;
- }
- }
- Expression->BaseEntry.Flags = EntryFlags;
- ParseBasicRegularExpressionEnd:
- return Status;
- }
- REGULAR_EXPRESSION_STATUS
- ClpParseBasicRegularExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY ParentEntry
- )
- /*++
- Routine Description:
- This routine parses a de-anchored expression for a basic regular
- expression.
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer.
- Expression - Supplies the expression to compile on.
- ParentEntry - Supplies a pointer to the regular expression entry this entry
- belongs under.
- Return Value:
- Regular expression status code.
- --*/
- {
- PREGULAR_EXPRESSION_ENTRY Entry;
- PREGULAR_EXPRESSION_ENTRY PreviousEntry;
- BOOL Result;
- REGULAR_EXPRESSION_STATUS Status;
- Entry = NULL;
- PreviousEntry = NULL;
- //
- // Loop parsing simple expressions and duplication symbols.
- //
- while (TRUE) {
- Status = ClpParseSimpleExpression(Lexer, Expression, &Entry);
- if (Status != RegexStatusSuccess) {
- goto ParseRegularExpressionEnd;
- }
- if (Entry == NULL) {
- //
- // Don't allow repeat symbols coming up if there was no entry,
- // which could happen if the first input is a repeat character.
- //
- if ((Lexer->Token == '*') ||
- (Lexer->Token == TOKEN_ESCAPED_OPEN_BRACE)) {
- Status = RegexStatusInvalidRepeat;
- }
- break;
- }
- Status = ClpParseRegularExpressionDuplicationSymbol(Lexer,
- Expression,
- Entry);
- if (Status != RegexStatusSuccess) {
- goto ParseRegularExpressionEnd;
- }
- //
- // Here's a little optimization. If this entry is just an ordinary
- // character and the last one is too, combine them into one entry to
- // avoid a gigantic chain of single character expression entries.
- // Watch out not to do this if either one had duplicate symbols on it
- // (ie "f*a" can't just be combined to "fa").
- //
- if ((Entry->Type == RegexEntryOrdinaryCharacters) &&
- (Entry->DuplicateMin == 1) && (Entry->DuplicateMax == 1) &&
- (PreviousEntry != NULL) &&
- (PreviousEntry->Type == RegexEntryOrdinaryCharacters) &&
- (PreviousEntry->DuplicateMin == 1) &&
- (PreviousEntry->DuplicateMax == 1)) {
- Result = ClpAppendRegularExpressionString(
- &(PreviousEntry->U.String),
- Entry->U.String.Data,
- Entry->U.String.Size);
- if (Result == FALSE) {
- Status = RegexStatusNoMemory;
- goto ParseRegularExpressionEnd;
- }
- ClpDestroyRegularExpressionEntry(Entry);
- Entry = NULL;
- continue;
- }
- //
- // Add this expression entry to the parent.
- //
- Entry->Parent = ParentEntry;
- INSERT_BEFORE(&(Entry->ListEntry), &(ParentEntry->ChildList));
- PreviousEntry = Entry;
- Entry = NULL;
- }
- ParseRegularExpressionEnd:
- if (Entry != NULL) {
- ClpDestroyRegularExpressionEntry(Entry);
- }
- return Status;
- }
- REGULAR_EXPRESSION_STATUS
- ClpParseSimpleExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY *Entry
- )
- /*++
- Routine Description:
- This routine parses a simple expression for a basic regular expression.
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer.
- Expression - Supplies the expression to compile on.
- Entry - Supplies a pointer where the regular expression entry will be
- returned on success.
- Return Value:
- Regular expression status code.
- --*/
- {
- CHAR Character;
- PREGULAR_EXPRESSION_ENTRY NewEntry;
- BOOL Result;
- REGULAR_EXPRESSION_STATUS Status;
- Status = RegexStatusSuccess;
- NewEntry = ClpCreateRegularExpressionEntry(RegexEntryInvalid);
- if (NewEntry == NULL) {
- Status = RegexStatusNoMemory;
- goto ParseSimpleExpressionEnd;
- }
- switch (Lexer->Token) {
- case '.':
- NewEntry->Type = RegexEntryAnyCharacter;
- break;
- case '[':
- Status = ClpParseBracketExpression(Lexer, Expression, NewEntry);
- if (Status != RegexStatusSuccess) {
- goto ParseSimpleExpressionEnd;
- }
- break;
- //
- // Parse a subexpression.
- //
- case TOKEN_ESCAPED_OPEN_PARENTHESES:
- NewEntry->Type = RegexEntrySubexpression;
- //
- // Zoom past the open parentheses.
- //
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseSimpleExpressionEnd;
- }
- //
- // Take a subexpression number and parse a subexpression.
- //
- Expression->SubexpressionCount += 1;
- NewEntry->U.SubexpressionNumber = Expression->SubexpressionCount;
- Status = ClpParseBasicRegularExpression(Lexer,
- Expression,
- NewEntry);
- if (Status != RegexStatusSuccess) {
- goto ParseSimpleExpressionEnd;
- }
- //
- // Get the close parentheses and swallow it.
- //
- if (Lexer->Token != TOKEN_ESCAPED_CLOSE_PARENTHESES) {
- Status = RegexStatusParenthesesImbalance;
- goto ParseSimpleExpressionEnd;
- }
- break;
- //
- // Parse a quoted character like a normal character.
- //
- case TOKEN_QUOTED_CHARACTER:
- NewEntry->Type = RegexEntryOrdinaryCharacters;
- assert(Lexer->NextInput != 0);
- Character = Lexer->Input[Lexer->NextInput - 1];
- Result = ClpAppendRegularExpressionString(&(NewEntry->U.String),
- &Character,
- 1);
- if (Result == FALSE) {
- goto ParseSimpleExpressionEnd;
- }
- break;
- //
- // Parse a back reference.
- //
- case TOKEN_BACK_REFERENCE:
- NewEntry->Type = RegexEntryBackReference;
- assert(Lexer->NextInput != 0);
- Character = Lexer->Input[Lexer->NextInput - 1];
- //
- // The lexer shouldn't be returning the back reference token unless
- // it's valid.
- //
- assert((Character >= '1') && (Character <= '9'));
- NewEntry->U.BackReferenceNumber = Character - '0';
- if (NewEntry->U.BackReferenceNumber > Expression->SubexpressionCount) {
- Status = RegexStatusInvalidSubexpression;
- goto ParseSimpleExpressionEnd;
- }
- break;
- //
- // Some items are not simple expression entries by themselves. As an oddity,
- // one of these characters at the beginning of a basic regular expression
- // is considered a normal character.
- //
- case '*':
- case TOKEN_ESCAPED_CLOSE_PARENTHESES:
- case TOKEN_ESCAPED_OPEN_BRACE:
- case TOKEN_ESCAPED_CLOSE_BRACE:
- case '\0':
- if ((Lexer->Token == '\0') || (Lexer->NextInput != 1)) {
- ClpDestroyRegularExpressionEntry(NewEntry);
- NewEntry = NULL;
- goto ParseSimpleExpressionEnd;
- }
- //
- // Fall through.
- //
- //
- // This must be an ordinary character.
- //
- default:
- assert(Lexer->Token < MAX_UCHAR);
- NewEntry->Type = RegexEntryOrdinaryCharacters;
- Character = Lexer->Token;
- //
- // Watch out for a dollar sign at the end, which is actually an anchor.
- //
- if ((Character == '$') &&
- ((Lexer->NextInput >= Lexer->InputSize) ||
- (Lexer->Input[Lexer->NextInput] == '\0'))) {
- ClpDestroyRegularExpressionEntry(NewEntry);
- NewEntry = NULL;
- goto ParseSimpleExpressionEnd;
- }
- Result = ClpAppendRegularExpressionString(&(NewEntry->U.String),
- &Character,
- 1);
- if (Result == FALSE) {
- goto ParseSimpleExpressionEnd;
- }
- break;
- }
- //
- // Swallow the token that has just been dealt with.
- //
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseSimpleExpressionEnd;
- }
- ParseSimpleExpressionEnd:
- if (Status != RegexStatusSuccess) {
- if (NewEntry != NULL) {
- ClpDestroyRegularExpressionEntry(NewEntry);
- NewEntry = NULL;
- }
- }
- *Entry = NewEntry;
- return Status;
- }
- REGULAR_EXPRESSION_STATUS
- ClpParseExtendedRegularExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY ParentEntry
- )
- /*++
- Routine Description:
- This routine parses an extended regular expression.
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer.
- Expression - Supplies the expression to compile on.
- ParentEntry - Supplies a pointer to the parent any new expression entries
- should be placed under.
- Return Value:
- Regular expression status code.
- --*/
- {
- PREGULAR_EXPRESSION_ENTRY Branch;
- PREGULAR_EXPRESSION_ENTRY Child;
- PREGULAR_EXPRESSION_ENTRY Entry;
- REGULAR_EXPRESSION_STATUS Status;
- Entry = NULL;
- //
- // Create the branch umbrella.
- //
- Branch = ClpCreateRegularExpressionEntry(RegexEntryBranch);
- if (Branch == NULL) {
- Status = RegexStatusNoMemory;
- goto ParseExtendedRegularExpressionEnd;
- }
- //
- // Loop creating the branch options.
- //
- while (TRUE) {
- //
- // Create a branch entry to contain the upcoming expression.
- //
- Entry = ClpCreateRegularExpressionEntry(RegexEntryBranchOption);
- if (Entry == NULL) {
- Status = RegexStatusNoMemory;
- goto ParseExtendedRegularExpressionEnd;
- }
- //
- // Parse out the contents of the branch.
- //
- Status = ClpParseExtendedRegularExpressionBranch(Lexer,
- Expression,
- Entry);
- if (Status != RegexStatusSuccess) {
- goto ParseExtendedRegularExpressionEnd;
- }
- //
- // Add this branch to the parent.
- //
- Entry->Parent = Branch;
- INSERT_BEFORE(&(Entry->ListEntry), &(Branch->ChildList));
- Entry = NULL;
- //
- // Stop if there's not more.
- //
- if (Lexer->Token != '|') {
- break;
- }
- //
- // Get past that pipe and go around again.
- //
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseExtendedRegularExpressionEnd;
- }
- }
- //
- // If there's only one branch option, pull all the children off of the
- // branch option and stick them on the parent. Then the branch and branch
- // option entries can be destroyed.
- //
- assert(LIST_EMPTY(&(Branch->ChildList)) == FALSE);
- if (Branch->ChildList.Next->Next == &(Branch->ChildList)) {
- Entry = LIST_VALUE(Branch->ChildList.Next,
- REGULAR_EXPRESSION_ENTRY,
- ListEntry);
- while (LIST_EMPTY(&(Entry->ChildList)) == FALSE) {
- Child = LIST_VALUE(Entry->ChildList.Next,
- REGULAR_EXPRESSION_ENTRY,
- ListEntry);
- LIST_REMOVE(&(Child->ListEntry));
- INSERT_BEFORE(&(Child->ListEntry), &(ParentEntry->ChildList));
- Child->Parent = ParentEntry;
- }
- ClpDestroyRegularExpressionEntry(Branch);
- Entry = NULL;
- Branch = NULL;
- //
- // There are multiple branch options, so put the branch entry on the parent.
- //
- } else {
- Branch->Parent = ParentEntry;
- INSERT_BEFORE(&(Branch->ListEntry), &(ParentEntry->ChildList));
- }
- ParseExtendedRegularExpressionEnd:
- if (Status != RegexStatusSuccess) {
- if (Branch != NULL) {
- ClpDestroyRegularExpressionEntry(Branch);
- }
- }
- if (Entry != NULL) {
- ClpDestroyRegularExpressionEntry(Entry);
- }
- return Status;
- }
- REGULAR_EXPRESSION_STATUS
- ClpParseExtendedRegularExpressionBranch (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY ParentEntry
- )
- /*++
- Routine Description:
- This routine parses a single extended regular expression branch (ie the
- expression "a|b|c" has three branches, this routine parses just one
- of them).
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer.
- Expression - Supplies the expression to compile on.
- ParentEntry - Supplies a pointer to the parent any new expression entries
- should be placed under.
- Return Value:
- Regular expression status code.
- --*/
- {
- PREGULAR_EXPRESSION_ENTRY Entry;
- PREGULAR_EXPRESSION_ENTRY PreviousEntry;
- BOOL Result;
- REGULAR_EXPRESSION_STATUS Status;
- Entry = NULL;
- PreviousEntry = NULL;
- //
- // Loop parsing simple expressions and duplication symbols.
- //
- while (TRUE) {
- Status = ClpParseExtendedExpression(Lexer, Expression, &Entry);
- if (Status != RegexStatusSuccess) {
- goto ParseRegularExpressionEnd;
- }
- if (Entry == NULL) {
- //
- // Don't allow repeat symbols coming up if there was no entry,
- // which could happen if the first input is a repeat character.
- //
- if ((Lexer->Token == '*') || (Lexer->Token == '+') ||
- (Lexer->Token == '?') || (Lexer->Token == '{')) {
- Status = RegexStatusInvalidRepeat;
- }
- break;
- }
- Status = ClpParseRegularExpressionDuplicationSymbol(Lexer,
- Expression,
- Entry);
- if (Status != RegexStatusSuccess) {
- goto ParseRegularExpressionEnd;
- }
- //
- // Here's a little optimization. If this entry is just an ordinary
- // character and the last one is too, combine them into one entry to
- // avoid a gigantic chain of single character expression entries.
- // Watch out not to do this if either one had duplicate symbols on it
- // (ie "f*a" can't just be combined to "fa").
- //
- if ((Entry->Type == RegexEntryOrdinaryCharacters) &&
- (Entry->DuplicateMin == 1) && (Entry->DuplicateMax == 1) &&
- (PreviousEntry != NULL) &&
- (PreviousEntry->Type == RegexEntryOrdinaryCharacters) &&
- (PreviousEntry->DuplicateMin == 1) &&
- (PreviousEntry->DuplicateMax == 1)) {
- Result = ClpAppendRegularExpressionString(
- &(PreviousEntry->U.String),
- Entry->U.String.Data,
- Entry->U.String.Size);
- if (Result == FALSE) {
- Status = RegexStatusNoMemory;
- goto ParseRegularExpressionEnd;
- }
- ClpDestroyRegularExpressionEntry(Entry);
- Entry = NULL;
- continue;
- }
- //
- // Add this expression entry to the parent.
- //
- Entry->Parent = ParentEntry;
- INSERT_BEFORE(&(Entry->ListEntry), &(ParentEntry->ChildList));
- PreviousEntry = Entry;
- Entry = NULL;
- }
- ParseRegularExpressionEnd:
- if (Entry != NULL) {
- ClpDestroyRegularExpressionEntry(Entry);
- }
- return Status;
- }
- REGULAR_EXPRESSION_STATUS
- ClpParseExtendedExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY *Entry
- )
- /*++
- Routine Description:
- This routine parses a base expression ("ERE_expression" as described in the
- specs) for an extended regular expression.
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer.
- Expression - Supplies the expression to compile on.
- Entry - Supplies a pointer where the regular expression entry will be
- returned on success.
- Return Value:
- Regular expression status code.
- --*/
- {
- CHAR Character;
- PREGULAR_EXPRESSION_ENTRY NewEntry;
- BOOL Result;
- REGULAR_EXPRESSION_STATUS Status;
- Status = RegexStatusSuccess;
- NewEntry = ClpCreateRegularExpressionEntry(RegexEntryInvalid);
- if (NewEntry == NULL) {
- Status = RegexStatusNoMemory;
- goto ParseExtendedExpressionEnd;
- }
- switch (Lexer->Token) {
- case '.':
- NewEntry->Type = RegexEntryAnyCharacter;
- break;
- case '^':
- NewEntry->Type = RegexEntryStringBegin;
- break;
- case '$':
- NewEntry->Type = RegexEntryStringEnd;
- break;
- case '[':
- Status = ClpParseBracketExpression(Lexer, Expression, NewEntry);
- if (Status != RegexStatusSuccess) {
- goto ParseExtendedExpressionEnd;
- }
- break;
- //
- // Parse a subexpression.
- //
- case '(':
- NewEntry->Type = RegexEntrySubexpression;
- //
- // Zoom past the open parentheses.
- //
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseExtendedExpressionEnd;
- }
- //
- // Take a subexpression number and parse the subexpression.
- //
- Lexer->ActiveSubexpressionCount += 1;
- Expression->SubexpressionCount += 1;
- NewEntry->U.SubexpressionNumber = Expression->SubexpressionCount;
- Status = ClpParseExtendedRegularExpression(Lexer,
- Expression,
- NewEntry);
- if (Status != RegexStatusSuccess) {
- goto ParseExtendedExpressionEnd;
- }
- //
- // Verify the close parentheses.
- //
- if (Lexer->Token != ')') {
- Status = RegexStatusParenthesesImbalance;
- goto ParseExtendedExpressionEnd;
- }
- Lexer->ActiveSubexpressionCount -= 1;
- break;
- //
- // Parse a quoted character like a normal character.
- //
- case TOKEN_QUOTED_CHARACTER:
- NewEntry->Type = RegexEntryOrdinaryCharacters;
- assert(Lexer->NextInput != 0);
- Character = Lexer->Input[Lexer->NextInput - 1];
- Result = ClpAppendRegularExpressionString(&(NewEntry->U.String),
- &Character,
- 1);
- if (Result == FALSE) {
- goto ParseExtendedExpressionEnd;
- }
- break;
- //
- // Parse a back reference.
- //
- case TOKEN_BACK_REFERENCE:
- NewEntry->Type = RegexEntryBackReference;
- assert(Lexer->NextInput != 0);
- Character = Lexer->Input[Lexer->NextInput - 1];
- //
- // The lexer shouldn't be returning the back reference token unless
- // it's valid.
- //
- assert((Character >= '0') && (Character <= '9'));
- NewEntry->U.BackReferenceNumber = Character - '0';
- if (NewEntry->U.BackReferenceNumber > Expression->SubexpressionCount) {
- Status = RegexStatusInvalidSubexpression;
- goto ParseExtendedExpressionEnd;
- }
- break;
- //
- // Some items are not simple expression entries by themselves.
- //
- case '*':
- case '+':
- case '?':
- case '{':
- case '|':
- case '\0':
- ClpDestroyRegularExpressionEntry(NewEntry);
- NewEntry = NULL;
- goto ParseExtendedExpressionEnd;
- //
- // This must be an ordinary character.
- //
- default:
- assert(Lexer->Token < MAX_UCHAR);
- NewEntry->Type = RegexEntryOrdinaryCharacters;
- Character = Lexer->Token;
- //
- // Watch out for a close parentheses if there are active open ones.
- //
- if ((Character == ')') && (Lexer->ActiveSubexpressionCount != 0)) {
- ClpDestroyRegularExpressionEntry(NewEntry);
- NewEntry = NULL;
- goto ParseExtendedExpressionEnd;
- }
- Result = ClpAppendRegularExpressionString(&(NewEntry->U.String),
- &Character,
- 1);
- if (Result == FALSE) {
- goto ParseExtendedExpressionEnd;
- }
- break;
- }
- //
- // Swallow the token that has just been dealt with.
- //
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseExtendedExpressionEnd;
- }
- ParseExtendedExpressionEnd:
- if (Status != RegexStatusSuccess) {
- if (NewEntry != NULL) {
- ClpDestroyRegularExpressionEntry(NewEntry);
- NewEntry = NULL;
- }
- }
- *Entry = NewEntry;
- return Status;
- }
- REGULAR_EXPRESSION_STATUS
- ClpParseBracketExpression (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY Entry
- )
- /*++
- Routine Description:
- This routine parses a bracket expression, which expresses a set of
- characters or collating elements that satisfy the expression.
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer.
- Expression - Supplies the expression to compile on.
- Entry - Supplies a pointer to the regular expression bracket entry.
- Return Value:
- Regular expression status code.
- --*/
- {
- PREGULAR_BRACKET_ENTRY BracketEntry;
- BRACKET_EXPRESSION_TYPE BracketType;
- CHAR PreviousCharacter;
- PREGULAR_EXPRESSION_STRING RegularCharacters;
- BOOL Result;
- REGULAR_EXPRESSION_STATUS Status;
- BOOL Stop;
- PSTR String;
- assert(Lexer->Token == '[');
- assert(Lexer->NextInput != 0);
- //
- // See if this is a start-of-word or end-of-word, and not actually a
- // bracket expression.
- //
- String = Lexer->Input + Lexer->NextInput - 1;
- if (strncmp(String, "[[:<:]]", 7) == 0) {
- Entry->Type = RegexEntryStartOfWord;
- Lexer->Input += 6;
- Status = RegexStatusSuccess;
- goto ParseBracketExpressionEnd;
- } else if (strncmp(String, "[[:>:]]", 7) == 0) {
- Entry->Type = RegexEntryEndOfWord;
- Lexer->Input += 6;
- Status = RegexStatusSuccess;
- goto ParseBracketExpressionEnd;
- }
- Entry->Type = RegexEntryBracketExpression;
- INITIALIZE_LIST_HEAD(&(Entry->U.BracketExpression.EntryList));
- //
- // Swallow the open bracket.
- //
- Status = ClpGetBracketExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseBracketExpressionEnd;
- }
- RegularCharacters = &(Entry->U.BracketExpression.RegularCharacters);
- //
- // A circumflex negates the whole expression (ie matches on characters
- // *not* in this set.
- //
- if (Lexer->Token == '^') {
- Entry->Flags |= REGULAR_EXPRESSION_NEGATED;
- Status = ClpGetBracketExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseBracketExpressionEnd;
- }
- }
- //
- // A closing bracket or minus here is treated as an ordinary character.
- //
- if ((Lexer->Token == ']') || (Lexer->Token == '-')) {
- Result = ClpAppendRegularExpressionString(RegularCharacters,
- (PSTR)&(Lexer->Token),
- 1);
- if (Result == FALSE) {
- Status = RegexStatusNoMemory;
- goto ParseBracketExpressionEnd;
- }
- Status = ClpGetBracketExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseBracketExpressionEnd;
- }
- }
- //
- // Loop adding characters to this bracket expression.
- //
- PreviousCharacter = 0;
- Stop = FALSE;
- while (TRUE) {
- switch (Lexer->Token) {
- case TOKEN_OPEN_COLON:
- String = Lexer->Input + Lexer->NextInput;
- BracketType = BracketExpressionInvalid;
- if (strncmp(String, "alnum", 5) == 0) {
- BracketType = BracketExpressionCharacterClassAlphanumeric;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "alpha", 5) == 0) {
- BracketType = BracketExpressionCharacterClassAlphabetic;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "blank", 5) == 0) {
- BracketType = BracketExpressionCharacterClassBlank;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "cntrl", 5) == 0) {
- BracketType = BracketExpressionCharacterClassControl;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "digit", 5) == 0) {
- BracketType = BracketExpressionCharacterClassDigit;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "graph", 5) == 0) {
- BracketType = BracketExpressionCharacterClassGraph;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "lower", 5) == 0) {
- BracketType = BracketExpressionCharacterClassLowercase;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "print", 5) == 0) {
- BracketType = BracketExpressionCharacterClassPrintable;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "punct", 5) == 0) {
- BracketType = BracketExpressionCharacterClassPunctuation;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "space", 5) == 0) {
- BracketType = BracketExpressionCharacterClassSpace;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "upper", 5) == 0) {
- BracketType = BracketExpressionCharacterClassUppercase;
- Lexer->NextInput += 5;
- } else if (strncmp(String, "xdigit", 6) == 0) {
- BracketType = BracketExpressionCharacterClassHexDigit;
- Lexer->NextInput += 6;
- } else if (strncmp(String, "name", 4) == 0) {
- BracketType = BracketExpressionCharacterClassName;
- Lexer->NextInput += 5;
- }
- if (BracketType == BracketExpressionInvalid) {
- Status = RegexStatusBadCharacterClass;
- goto ParseBracketExpressionEnd;
- }
- BracketEntry = malloc(sizeof(REGULAR_BRACKET_ENTRY));
- if (BracketEntry == NULL) {
- Status = RegexStatusNoMemory;
- goto ParseBracketExpressionEnd;
- }
- memset(BracketEntry, 0, sizeof(REGULAR_BRACKET_ENTRY));
- BracketEntry->Type = BracketType;
- INSERT_BEFORE(&(BracketEntry->ListEntry),
- &(Entry->U.BracketExpression.EntryList));
- //
- // Swallow up the colon close.
- //
- Status = ClpGetBracketExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseBracketExpressionEnd;
- }
- if (Lexer->Token != ':') {
- Status = RegexStatusBadPattern;
- goto ParseBracketExpressionEnd;
- }
- Status = ClpGetBracketExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseBracketExpressionEnd;
- }
- if (Lexer->Token != ']') {
- Status = RegexStatusBadPattern;
- goto ParseBracketExpressionEnd;
- }
- break;
- case TOKEN_OPEN_DOT:
- printf("regex: Collating element support not implemented!\n");
- //
- // Spin until the dot close.
- //
- while ((PreviousCharacter != '.') || (Lexer->Token != ']')) {
- if (Lexer->Token == '\0') {
- Status = RegexStatusBracketImbalance;
- goto ParseBracketExpressionEnd;
- }
- PreviousCharacter = (UCHAR)Lexer->Token;
- Status = ClpGetBracketExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseBracketExpressionEnd;
- }
- }
- break;
- case TOKEN_OPEN_EQUAL:
- printf("regex: Equivalence class support not implemented!\n");
- //
- // Spin until the equal close.
- //
- while ((PreviousCharacter != '=') || (Lexer->Token != ']')) {
- if (Lexer->Token == '\0') {
- Status = RegexStatusBracketImbalance;
- goto ParseBracketExpressionEnd;
- }
- PreviousCharacter = (UCHAR)Lexer->Token;
- Status = ClpGetBracketExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseBracketExpressionEnd;
- }
- }
- break;
- case ']':
- Stop = TRUE;
- break;
- case '\0':
- Status = RegexStatusBracketImbalance;
- goto ParseBracketExpressionEnd;
- default:
- if (Lexer->Token > MAX_UCHAR) {
- Status = RegexStatusBadPattern;
- goto ParseBracketExpressionEnd;
- }
- //
- // If the previous character was a -, this is actually a range.
- // Pull the dash and first character off of the regular characters
- // list, and create a range.
- //
- if (PreviousCharacter == '-') {
- BracketEntry = malloc(sizeof(REGULAR_BRACKET_ENTRY));
- if (BracketEntry == NULL) {
- Status = RegexStatusNoMemory;
- goto ParseBracketExpressionEnd;
- }
- memset(BracketEntry, 0, sizeof(REGULAR_BRACKET_ENTRY));
- BracketEntry->Type = BracketExpressionRange;
- assert(RegularCharacters->Size >= 2);
- BracketEntry->U.Range.Minimum =
- RegularCharacters->Data[RegularCharacters->Size - 2];
- RegularCharacters->Size -= 2;
- BracketEntry->U.Range.Maximum = Lexer->Token;
- INSERT_BEFORE(&(BracketEntry->ListEntry),
- &(Entry->U.BracketExpression.EntryList));
- //
- // This is a regular character and not part of a range (or at least
- // the beginning character of the range. Add it to the regular
- // character string.
- //
- } else {
- Result = ClpAppendRegularExpressionString(RegularCharacters,
- (PSTR)&(Lexer->Token),
- 1);
- if (Result == FALSE) {
- Status = RegexStatusNoMemory;
- goto ParseBracketExpressionEnd;
- }
- }
- break;
- }
- if (Stop != FALSE) {
- break;
- }
- if (Lexer->Token < MAX_UCHAR) {
- PreviousCharacter = (CHAR)Lexer->Token;
- }
- Status = ClpGetBracketExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseBracketExpressionEnd;
- }
- }
- ParseBracketExpressionEnd:
- return Status;
- }
- REGULAR_EXPRESSION_STATUS
- ClpParseRegularExpressionDuplicationSymbol (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY Entry
- )
- /*++
- Routine Description:
- This routine parses any optional duplication symbols for the regular
- expression.
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer.
- Expression - Supplies the expression to compile on.
- Entry - Supplies a pointer where the regular expression entry will be
- returned on success.
- Return Value:
- Regular expression status code.
- --*/
- {
- REGULAR_EXPRESSION_STATUS Status;
- BOOL SwallowToken;
- Status = RegexStatusSuccess;
- SwallowToken = FALSE;
- while (TRUE) {
- //
- // Stars are pretty easy, they're just zero or more occurrences.
- //
- if (Lexer->Token == '*') {
- Entry->DuplicateMin = 0;
- Entry->DuplicateMax = -1;
- SwallowToken = TRUE;
- //
- // Extended regular expressions are fancy in that they have a couple
- // extra duplication options.
- //
- } else if ((Expression->Flags & REG_EXTENDED) != 0) {
- //
- // Handle a +, which is one or more occurrences.
- //
- if (Lexer->Token == '+') {
- if (Entry->DuplicateMin > 1) {
- Entry->DuplicateMin = 1;
- }
- Entry->DuplicateMax = -1;
- SwallowToken = TRUE;
- //
- // Handle a ?, which is zero or one occurrences.
- //
- } else if (Lexer->Token == '?') {
- Entry->DuplicateMin = 0;
- Entry->DuplicateMax = 1;
- SwallowToken = TRUE;
- //
- // Handle an opening curly brace, which just like the escaped curly
- // brace in basic regular expressions in that it takes the form {M},
- // {M,}, or {M,N} specifying the minimum and maximum number of
- // occurrences (inclusive).
- //
- } else if (Lexer->Token == '{') {
- Status = ClpParseRegularExpressionDuplicationCount(Lexer,
- Expression,
- Entry);
- if (Status != RegexStatusSuccess) {
- goto ParseRegularExpressionDuplicationSymbolEnd;
- }
- } else {
- Status = RegexStatusSuccess;
- goto ParseRegularExpressionDuplicationSymbolEnd;
- }
- //
- // Basic expressions only. The only other duplication basic expressions
- // can do is backquote braces.
- //
- } else {
- if (Lexer->Token != TOKEN_ESCAPED_OPEN_BRACE) {
- Status = RegexStatusSuccess;
- goto ParseRegularExpressionDuplicationSymbolEnd;
- }
- //
- // Parse the duplication range.
- //
- Status = ClpParseRegularExpressionDuplicationCount(Lexer,
- Expression,
- Entry);
- if (Status != RegexStatusSuccess) {
- goto ParseRegularExpressionDuplicationSymbolEnd;
- }
- }
- if (SwallowToken != FALSE) {
- SwallowToken = FALSE;
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ParseRegularExpressionDuplicationSymbolEnd;
- }
- }
- }
- ParseRegularExpressionDuplicationSymbolEnd:
- if (SwallowToken != FALSE) {
- assert(Status == RegexStatusSuccess);
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- }
- return Status;
- }
- REGULAR_EXPRESSION_STATUS
- ClpParseRegularExpressionDuplicationCount (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression,
- PREGULAR_EXPRESSION_ENTRY Entry
- )
- /*++
- Routine Description:
- This routine parses a duplication count, which takes the form "M", "M,", or
- "M,N", followed by either a close curly brace or close escaped curly brace
- depending on whether extended mode is on or not.
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer.
- Expression - Supplies the expression to compile on.
- Entry - Supplies a pointer where the regular expression entry will be
- returned on success.
- Return Value:
- Regular expression status code.
- --*/
- {
- PSTR AfterScan;
- PSTR BeforeScan;
- LONG Begin;
- CHAR Character;
- LONG End;
- REGULAR_EXPRESSION_STATUS Status;
- End = Entry->DuplicateMax;
- //
- // Get the first number.
- //
- BeforeScan = Lexer->Input + Lexer->NextInput;
- Begin = strtol(BeforeScan, &AfterScan, 10);
- if ((Begin < 0) || (AfterScan == BeforeScan)) {
- Status = RegexStatusInvalidBraces;
- goto ClpParseRegularExpressionDuplicationCountEnd;
- }
- End = Begin;
- Lexer->NextInput += (UINTN)AfterScan - (UINTN)BeforeScan;
- assert(Lexer->NextInput <= Lexer->InputSize);
- while ((Lexer->NextInput < Lexer->InputSize) &&
- (isspace(Lexer->Input[Lexer->NextInput]))) {
- Lexer->NextInput += 1;
- }
- if (Lexer->NextInput == Lexer->InputSize) {
- Status = RegexStatusInvalidBraces;
- goto ClpParseRegularExpressionDuplicationCountEnd;
- }
- //
- // If there's a comma, swallow that and get an optional second number.
- //
- if (Lexer->Input[Lexer->NextInput] == ',') {
- Lexer->NextInput += 1;
- while ((Lexer->NextInput < Lexer->InputSize) &&
- (isspace(Lexer->Input[Lexer->NextInput]))) {
- Lexer->NextInput += 1;
- }
- if (Lexer->NextInput == Lexer->InputSize) {
- Status = RegexStatusInvalidBraces;
- goto ClpParseRegularExpressionDuplicationCountEnd;
- }
- Character = Lexer->Input[Lexer->NextInput];
- if ((Character >= '1') && (Character <= '9')) {
- BeforeScan = Lexer->Input + Lexer->NextInput;
- End = strtol(BeforeScan, &AfterScan, 10);
- if ((End < 0) || (AfterScan == BeforeScan)) {
- Status = RegexStatusInvalidBraces;
- goto ClpParseRegularExpressionDuplicationCountEnd;
- }
- Lexer->NextInput += (UINTN)AfterScan - (UINTN)BeforeScan;
- assert(Lexer->NextInput <= Lexer->InputSize);
- while ((Lexer->NextInput < Lexer->InputSize) &&
- (isspace(Lexer->Input[Lexer->NextInput]))) {
- Lexer->NextInput += 1;
- }
- if (Lexer->NextInput == Lexer->InputSize) {
- Status = RegexStatusInvalidBraces;
- goto ClpParseRegularExpressionDuplicationCountEnd;
- }
- //
- // In the {M,} form, the pattern matches at least M times with no
- // upper limit.
- //
- } else {
- End = -1;
- }
- }
- //
- // Now, get the next token and verify that it's a closing brace.
- //
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ClpParseRegularExpressionDuplicationCountEnd;
- }
- if ((Expression->Flags & REG_EXTENDED) != 0) {
- if (Lexer->Token != '}') {
- Status = RegexStatusInvalidBraces;
- goto ClpParseRegularExpressionDuplicationCountEnd;
- }
- } else {
- if (Lexer->Token != TOKEN_ESCAPED_CLOSE_BRACE) {
- Status = RegexStatusInvalidBraces;
- goto ClpParseRegularExpressionDuplicationCountEnd;
- }
- }
- //
- // Swallow that ending token.
- //
- Status = ClpGetRegularExpressionToken(Lexer, Expression);
- if (Status != RegexStatusSuccess) {
- goto ClpParseRegularExpressionDuplicationCountEnd;
- }
- //
- // Watch out for a backwards range.
- //
- if ((End != -1) && (End < Begin)) {
- Status = RegexStatusInvalidBraces;
- goto ClpParseRegularExpressionDuplicationCountEnd;
- }
- Status = RegexStatusSuccess;
- ClpParseRegularExpressionDuplicationCountEnd:
- if (Status == RegexStatusSuccess) {
- if ((Begin < Entry->DuplicateMin) || (Entry->DuplicateMin == 1)) {
- Entry->DuplicateMin = Begin;
- }
- if ((Entry->DuplicateMax != -1) && (End > Entry->DuplicateMax)) {
- Entry->DuplicateMax = End;
- }
- }
- return Status;
- }
- REGULAR_EXPRESSION_STATUS
- ClpGetRegularExpressionToken (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression
- )
- /*++
- Routine Description:
- This routine gets the next token out of the regular expression input.
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer. The token will be
- placed in the input lexer state.
- Expression - Supplies the expression to compile on.
- Return Value:
- Regular expression status code.
- --*/
- {
- CHAR Character;
- REGULAR_EXPRESSION_STATUS Status;
- Status = RegexStatusSuccess;
- //
- // Watch out for the end.
- //
- if ((Lexer->NextInput == Lexer->InputSize) ||
- (Lexer->Input[Lexer->NextInput] == '\0')) {
- Lexer->Token = '\0';
- goto GetRegularExpressionTokenEnd;
- }
- Character = Lexer->Input[Lexer->NextInput];
- Lexer->NextInput += 1;
- //
- // If it's just a regular character, send it on.
- //
- if (Character != '\\') {
- Lexer->Token = (UCHAR)Character;
- goto GetRegularExpressionTokenEnd;
- }
- //
- // If this was the end, that's a dangling escape.
- //
- if ((Lexer->NextInput == Lexer->InputSize) ||
- (Lexer->Input[Lexer->NextInput] == '\0')) {
- Status = RegexStatusTrailingEscape;
- goto GetRegularExpressionTokenEnd;
- }
- Character = Lexer->Input[Lexer->NextInput];
- Lexer->NextInput += 1;
- //
- // Back references work in both regular and extended.
- //
- if ((Character >= '1') && (Character <= '9')) {
- Lexer->Token = TOKEN_BACK_REFERENCE;
- goto GetRegularExpressionTokenEnd;
- //
- // Some quoted characters are common to both basic and extended regular
- // expressions.
- //
- } else if ((Character == '^') || (Character == '.') || (Character == '*') ||
- (Character == '[') || (Character == '$') || (Character == ']') ||
- (Character == '\\')) {
- Lexer->Token = TOKEN_QUOTED_CHARACTER;
- goto GetRegularExpressionTokenEnd;
- }
- //
- // Check for tokens specific to extended regular expressions.
- //
- if ((Expression->Flags & REG_EXTENDED) != 0) {
- //
- // Some quoted characters are only quoted in extended regular
- // expressions.
- //
- if ((Character == '(') || (Character == ')') || (Character == '|') ||
- (Character == '+') || (Character == '?') || (Character == '{') ||
- (Character == '}')) {
- Lexer->Token = TOKEN_QUOTED_CHARACTER;
- goto GetRegularExpressionTokenEnd;
- }
- //
- // Basic regular expression syntax only.
- //
- } else {
- if (Character == '(') {
- Lexer->Token = TOKEN_ESCAPED_OPEN_PARENTHESES;
- goto GetRegularExpressionTokenEnd;
- } else if (Character == ')') {
- Lexer->Token = TOKEN_ESCAPED_CLOSE_PARENTHESES;
- goto GetRegularExpressionTokenEnd;
- } else if (Character == '{') {
- Lexer->Token = TOKEN_ESCAPED_OPEN_BRACE;
- goto GetRegularExpressionTokenEnd;
- } else if (Character == '}') {
- Lexer->Token = TOKEN_ESCAPED_CLOSE_BRACE;
- goto GetRegularExpressionTokenEnd;
- }
- }
- //
- // If it's quoting a special character, back up and send the backslash
- // through directly.
- //
- if ((Character == '0') || (Character == 'n') || (Character == 'r') ||
- (Character == 'f') || (Character == 't') || (Character == 'v') ||
- (Character == 'b') || (Character == 'a')) {
- Lexer->NextInput -= 1;
- Lexer->Token = '\\';
- goto GetRegularExpressionTokenEnd;
- }
- //
- // This backslash doesn't seem to be quoting anything. Just serve up the
- // next character.
- //
- Lexer->Token = (CHAR)Character;
- GetRegularExpressionTokenEnd:
- return Status;
- }
- REGULAR_EXPRESSION_STATUS
- ClpGetBracketExpressionToken (
- PREGULAR_EXPRESSION_LEXER Lexer,
- PREGULAR_EXPRESSION Expression
- )
- /*++
- Routine Description:
- This routine gets the next token out of the regular expression input.
- Arguments:
- Lexer - Supplies a pointer to the initialized lexer. The token will be
- placed in the input lexer state.
- Expression - Supplies the expression to compile on.
- Return Value:
- Regular expression status code.
- --*/
- {
- CHAR Character;
- REGULAR_EXPRESSION_STATUS Status;
- Status = RegexStatusSuccess;
- //
- // Watch out for the end.
- //
- if ((Lexer->NextInput == Lexer->InputSize) ||
- (Lexer->Input[Lexer->NextInput] == '\0')) {
- Lexer->Token = '\0';
- goto GetBracketExpressionTokenEnd;
- }
- Character = Lexer->Input[Lexer->NextInput];
- Lexer->NextInput += 1;
- //
- // If it's just a regular character, send it on.
- //
- if (Character != '[') {
- Lexer->Token = (UCHAR)Character;
- goto GetBracketExpressionTokenEnd;
- }
- //
- // If this was the end, that's a dangling open bracket.
- //
- if ((Lexer->NextInput == Lexer->InputSize) ||
- (Lexer->Input[Lexer->NextInput] == '\0')) {
- Status = RegexStatusBracketImbalance;
- goto GetBracketExpressionTokenEnd;
- }
- Character = Lexer->Input[Lexer->NextInput];
- if (Character == '=') {
- Lexer->Token = TOKEN_OPEN_EQUAL;
- Lexer->NextInput += 1;
- } else if (Character == '.') {
- Lexer->Token = TOKEN_OPEN_DOT;
- Lexer->NextInput += 1;
- } else if (Character == ':') {
- Lexer->Token = TOKEN_OPEN_COLON;
- Lexer->NextInput += 1;
- } else {
- //
- // This is just a plain Jane open bracket.
- //
- Lexer->Token = '[';
- }
- GetBracketExpressionTokenEnd:
- return Status;
- }
- PREGULAR_EXPRESSION_ENTRY
- ClpCreateRegularExpressionEntry (
- REGEX_ENTRY_TYPE Type
- )
- /*++
- Routine Description:
- This routine creates a new regular expression entry of the given type.
- Arguments:
- Type - Supplies the type of entry to create.
- Return Value:
- Returns a pointer to the created entry on success.
- NULL on allocation failure.
- --*/
- {
- PREGULAR_EXPRESSION_ENTRY Entry;
- Entry = malloc(sizeof(REGULAR_EXPRESSION_ENTRY));
- if (Entry == NULL) {
- return NULL;
- }
- memset(Entry, 0, sizeof(REGULAR_EXPRESSION_ENTRY));
- Entry->Type = Type;
- Entry->DuplicateMin = 1;
- Entry->DuplicateMax = 1;
- INITIALIZE_LIST_HEAD(&(Entry->ChildList));
- return Entry;
- }
- VOID
- ClpDestroyRegularExpressionEntry (
- PREGULAR_EXPRESSION_ENTRY Entry
- )
- /*++
- Routine Description:
- This routine destroys a regular expression entry and recursively all of its
- children. It assumes the entry has already been removed from any parent
- list.
- Arguments:
- Entry - Supplies a pointer to the entry to destroy.
- Return Value:
- None.
- --*/
- {
- PREGULAR_BRACKET_ENTRY BracketEntry;
- PREGULAR_EXPRESSION_ENTRY Subentry;
- switch (Entry->Type) {
- case RegexEntryOrdinaryCharacters:
- if (Entry->U.String.Data != NULL) {
- free(Entry->U.String.Data);
- }
- break;
- case RegexEntryInvalid:
- case RegexEntryAnyCharacter:
- case RegexEntryBackReference:
- case RegexEntryStringBegin:
- case RegexEntryStringEnd:
- case RegexEntryBranch:
- case RegexEntryBranchOption:
- case RegexEntrySubexpression:
- case RegexEntryStartOfWord:
- case RegexEntryEndOfWord:
- break;
- case RegexEntryBracketExpression:
- if (Entry->U.BracketExpression.RegularCharacters.Data != NULL) {
- free(Entry->U.BracketExpression.RegularCharacters.Data);
- }
- while (LIST_EMPTY(&(Entry->U.BracketExpression.EntryList)) == FALSE) {
- BracketEntry = LIST_VALUE(Entry->U.BracketExpression.EntryList.Next,
- REGULAR_BRACKET_ENTRY,
- ListEntry);
- LIST_REMOVE(&(BracketEntry->ListEntry));
- free(BracketEntry);
- }
- break;
- default:
- assert(FALSE);
- return;
- }
- while (LIST_EMPTY(&(Entry->ChildList)) == FALSE) {
- Subentry = LIST_VALUE(Entry->ChildList.Next,
- REGULAR_EXPRESSION_ENTRY,
- ListEntry);
- LIST_REMOVE(&(Subentry->ListEntry));
- ClpDestroyRegularExpressionEntry(Subentry);
- }
- free(Entry);
- return;
- }
- BOOL
- ClpAppendRegularExpressionString (
- PREGULAR_EXPRESSION_STRING String,
- PSTR Data,
- ULONG Size
- )
- /*++
- Routine Description:
- This routine appends data onto a regular expression string.
- Arguments:
- String - Supplies a pointer to the string to append to.
- Data - Supplies a pointer to the data to append.
- Size - Supplies the number of bytes in the data buffer.
- Return Value:
- TRUE on success.
- FALSE on allocation failure.
- --*/
- {
- PVOID NewBuffer;
- ULONG NewCapacity;
- //
- // Reallocate the string if needed.
- //
- if (String->Size + Size > String->Capacity) {
- NewCapacity = String->Capacity;
- if (NewCapacity == 0) {
- NewCapacity = REGULAR_EXPRESSION_INITIAL_STRING_SIZE;
- }
- while (String->Size + Size > NewCapacity) {
- NewCapacity *= 2;
- }
- NewBuffer = realloc(String->Data, NewCapacity);
- if (NewBuffer == NULL) {
- String->Capacity = 0;
- return FALSE;
- }
- String->Data = NewBuffer;
- String->Capacity = NewCapacity;
- }
- //
- // Copy the new bytes in.
- //
- memcpy(String->Data + String->Size, Data, Size);
- String->Size += Size;
- return TRUE;
- }
|