/*++ Copyright (c) 2015 Minoca Corp. This file is licensed under the terms of the GNU General Public License version 3. Alternative licensing terms are available. Contact info@minocacorp.com for details. See the LICENSE file at the root of this project for complete licensing information. Module Name: yy.h Abstract: This header contains definitions for the basic Lexer/Parser library. Author: Evan Green 9-Oct-2015 --*/ // // ------------------------------------------------------------------- Includes // // // ---------------------------------------------------------------- Definitions // #ifndef YY_API #define YY_API __DLLIMPORT #endif // // Define lexer flags. // #define YY_LEX_FLAG_IGNORE_UNKNOWN 0x00000001 // // Define parser flags. // // // Set this flag to debug print every node the parser is attempting to parser. // #define YY_PARSE_FLAG_DEBUG 0x00000001 // // Set this flag to debug match successes. // #define YY_PARSE_FLAG_DEBUG_MATCHES 0x00000002 // // Set this flag to debug match failures (produces a lot of output). // #define YY_PARSE_FLAG_DEBUG_NON_MATCHES 0x00000004 // // Define parser grammar element flags. // // // Set this flag to replace the given element with its child node if there is // only one node and zero tokens. // #define YY_GRAMMAR_COLLAPSE_ONE 0x00000001 // // Set this flag to indicate that additional matches should not be added on // the end of a left recursive rule list-style, but should instead be nested // nodes. // #define YY_GRAMMAR_NEST_LEFT_RECURSION 0x00000002 // // ------------------------------------------------------ Data Type Definitions // // // LALR parser definitions. // typedef enum _YY_STATUS { YyStatusSuccess, YyStatusNoMemory, YyStatusTooManyItems, YyStatusInvalidSpecification, YyStatusInvalidParameter, YyStatusParseError, YyStatusLexError, } YY_STATUS, *PYY_STATUS; typedef SHORT YY_VALUE, *PYY_VALUE; typedef PVOID (*PYY_REALLOCATE) ( PVOID Context, PVOID Allocation, UINTN Size ); /*++ Routine Description: This routine is called to allocate, reallocate, or free memory. Arguments: Context - Supplies a pointer to the context passed into the parser. Allocation - Supplies an optional pointer to an existing allocation to either reallocate or free. If NULL, then a new allocation is being requested. Size - Supplies the size of the allocation request, in bytes. If this is non-zero, then an allocation or reallocation is being requested. If this is is 0, then the given memory should be freed. Return Value: Returns a pointer to the allocated memory on success. NULL on allocation failure or free. --*/ typedef INT (*PYY_PARSER_CALLBACK) ( PVOID Context, YY_VALUE Symbol, PVOID Elements, INT ElementCount, PVOID ReducedElement ); /*++ Routine Description: This routine is called for each grammar element that is successfully parsed. Arguments: Context - Supplies the context pointer supplied to the parse function. Symbol - Supplies the non-terminal symbol that was reduced. Elements - Supplies a pointer to an array of values containing the child elements in the rule. The function should know how many child elements there are based on the rule. ElementCount - Supplies the number of elements in the right hand side of this rule. ReducedElement - Supplies a pointer where the function can specify the reduction value to push back on the stack. If untouched, the default action is to push the first element. When called, the buffer will be set up for that, or zeroed if the rule is empty. Return Value: 0 on success. Returns a non-zero value if the parser should abort and return. --*/ typedef YY_STATUS (*PYY_PARSER_ERROR) ( PVOID Context, YY_STATUS Status ); /*++ Routine Description: This routine is called if the parser reaches an error state. Arguments: Context - Supplies the context pointer supplied to the parse function. Status - Supplies the error that occurred, probably a parse error. Return Value: 0 if the parser should try to recover. Returns a non-zero value if the parser should abort immediately and return. --*/ typedef INT (*PYY_PARSER_GET_TOKEN) ( PVOID Lexer, PYY_VALUE Value ); /*++ Routine Description: This routine is called to get a new token from the input. Arguments: Lexer - Supplies a pointer to the lexer context. Value - Supplies a pointer where the new token will be returned. The token data may be larger than simply a value, but it returns at least a value. Return Value: 0 on success, including EOF. Returns a non-zero value if there was an error reading the token. --*/ /*++ Structure Description: This structure stores a compiled grammar definition. These tables are machine generated by the yygen library. Members: LeftSide - Stores a pointer to an array of the left side symbol of each rule. Length - Stores a pointer to an array that describes how many elements are in each rule. DefaultReductions - Stores a pointer to an array of states to reduce to automatically. ShiftIndex - Stores an array of pointers indexed by state into the giant table of state transitions. ReduceIndex - Stores an array of pointers index by state into the giant table of state transitions. GotoIndex - Stores the goto array. Table - Stores the giant serialized table of state transitions to make. Check - Stores the giant serialized table of source states or shift symbols corresponding to each element in the table. DefaultGotos - Stores the condensed table of the most common goto for each state, which helps reduce the size of the goto table. TableSize - Stores the maximum valid element index in the table. (This index is inclusive). Names - Stores an array of strings containing the name of each terminal and non-terminal. Rules - Stores an array of strings describing each rule. FinalState - Stores the accept state index. FinalSymbol - Stores the symbol whose receipt indicates acceptance (ie can be followed by EOF). MaxToken - Stores the number of terminals in the grammar. UndefinedToken - Stores the token value in the names array that corresponds to an undefined token. --*/ typedef struct _YY_GRAMMAR { const YY_VALUE *LeftSide; const YY_VALUE *RuleLength; const YY_VALUE *DefaultReductions; const YY_VALUE *ShiftIndex; const YY_VALUE *ReduceIndex; const YY_VALUE *GotoIndex; const YY_VALUE *Table; const YY_VALUE *Check; const YY_VALUE *DefaultGotos; YY_VALUE TableSize; const char **Names; const char **Rules; YY_VALUE FinalState; YY_VALUE FinalSymbol; YY_VALUE MaxToken; YY_VALUE UndefinedToken; } YY_GRAMMAR, *PYY_GRAMMAR; /*++ Structure Description: This structure stores the context for an LALR(1) parser. Members: Grammar - Stores a pointer to the compiled grammar data. The yygen library should be used at build time to create this data. Reallocate - Stores a pointer to a function used to allocate, reallocate, and free memory from the heap. Callback - Stores a pointer to a function to call for each new grammar symbol parsed. Error - Stores a pointer to a function to call if the parser hits an error. Context - Stores a pointer to blindly pass into the callback function. Lexer - Stores a pointer to the initialized lexer, ready to emit tokens. GetToken - Stores a pointer to a function used to get the next lexer token. ValueSize - Stores the size in bytes of a single token or symbol. This must be at least as large as a YY_VALUE. ErrorCount - Stores the number of parsing errors that occured. Debug - Stores a boolean indicating whether or not to enable debug prints on the parser. DebugPrefix - Stores the prefix to prepend to every debug print within the parser. If this is non-null, debug prints are enabled for the parser. --*/ typedef struct _YY_PARSER { PYY_GRAMMAR Grammar; PYY_REALLOCATE Reallocate; PYY_PARSER_CALLBACK Callback; PYY_PARSER_ERROR Error; PVOID Context; PVOID Lexer; PYY_PARSER_GET_TOKEN GetToken; UINTN ValueSize; UINTN ErrorCount; PSTR DebugPrefix; } YY_PARSER, *PYY_PARSER; // // Lexer definitions // /*++ Structure Description: This structure stores the state for the lexer. To initialize this, zero it out. Members: Flags - Stores a bitfield of flags governing the lexer behavior. See YY_LEX_FLAG_* definitions. Allocate - Stores a pointer to a function used to allocate memory. Free - Stores a pointer to a function used to free allocated memory. Input - Stores the input buffer to lex. InputSize - Stores the size of the input buffer in bytes, including the null terminator if present. Position - Stores the current character position. Line - Stores the current one-based line number. Column - Stores the zero-based column number. TokenCount - Stores the number of tokens processed so far. LargestToken - Stores the size member of the largest single token seen so far. Note that this does not include space for a null terminator. TokenStringsSize - Stores the total number of bytes to allocate for strings for all tokens seen so far, including a null terminator on each one. Literals - Stores a pointer to a null-terminated string containing characters to pass through literally as individual tokens. Expressions - Stores an array of expression strings to match against. This must be terminated by a NULL entry. IgnoreExpressions - Stores an array of expression strings that should not produce tokens if they match. This is a place to put things like comment expressions. ExpressionNames - Stores an optional pointer to an array of strings that names each of the expressions. Useful for debugging, but not mandatory. TokenBase - Stores the value to assign for the first expression. 512 is usually a good value, as it won't alias with the literal characters. --*/ typedef struct _LEXER { ULONG Flags; PCSTR Input; ULONG InputSize; ULONG Position; ULONG Line; ULONG Column; ULONG TokenCount; ULONG LargestToken; ULONG TokenStringsSize; PSTR Literals; PSTR *Expressions; PSTR *IgnoreExpressions; PSTR *ExpressionNames; ULONG TokenBase; } LEXER, *PLEXER; /*++ Structure Description: This structure stores a lexer token. Members: Value - Stores the lexer token value. This may be a literal byte or a token value. Position - Stores the position of the token. Size - Stores the number of characters in the token. Line - Stores the line number of start of the token. Column - Stores the column number of the start of the token. String - Stores a pointer to the string of input text this token corresponds to. The lexer does not fill this out, but the member is provided here for convenience. --*/ typedef struct _LEXER_TOKEN { ULONG Value; ULONG Position; ULONG Size; ULONG Line; ULONG Column; PSTR String; } LEXER_TOKEN, *PLEXER_TOKEN; // // Parser definitions // typedef struct _PARSER_NODE PARSER_NODE, *PPARSER_NODE; typedef PVOID (*PYY_ALLOCATE) ( UINTN Size ); /*++ Routine Description: This routine is called when the lex/parse library needs to allocate memory. Arguments: Size - Supplies the size of the allocation request, in bytes. Return Value: Returns a pointer to the allocation if successful, or NULL if the allocation failed. --*/ typedef VOID (*PYY_FREE) ( PVOID Memory ); /*++ Routine Description: This routine is called when the lex/parse library needs to free allocated memory. Arguments: Memory - Supplies the allocation returned by the allocation routine. Return Value: None. --*/ typedef KSTATUS (*PYY_GET_TOKEN) ( PVOID Context, PLEXER_TOKEN Token ); /*++ Routine Description: This routine gets the next token for the parser. Arguments: Context - Supplies a context pointer initialized in the parser. Token - Supplies a pointer where the next token will be filled out and returned. Return Value: STATUS_SUCCESS on success. STATUS_END_OF_FILE if the file end was reached. STATUS_MALFORMED_DATA_STREAM if the given input matched no rule in the lexer and the lexer was not configured to ignore such things. --*/ typedef VOID (*PYY_NODE_CALLBACK) ( PVOID Context, PPARSER_NODE Node, BOOL Create ); /*++ Routine Description: This routine is called when a node is being created or destroyed. This callback must be prepared to create and destroy a node multiple times, as recursive descent parsers explore paths that ultimately prove to be incorrect. Unless the parser feeds back into the lexer (like in C), it is not recommended to use this callback. Arguments: Context - Supplies a context pointer initialized in the parser. Node - Supplies a pointer to the node being created or destroyed. Create - Supplies a boolean indicating if the node is being created (TRUE) or destroyed (FALSE). Return Value: None. --*/ /*++ Structure Description: This structure defines a grammar element in the parser grammar. Members: Name - Stores an optional pointer to the name of this grammar element. This is not used during parsing, but can be useful during debugging. Flags - Stores a bitfield of flags about this node. See YY_GRAMMAR_* definitions. Components - Stores a sequence of rule elements. Each element is either a token value or a rule value, determined by the grammar base and grammar end values in the parser. Each form of a grammar expression is terminated by a zero value. The next alternate form starts after the zero. Terminate the sequence with an additional zero to end the node form. --*/ typedef struct _PARSER_GRAMMAR_ELEMENT { PSTR Name; ULONG Flags; PULONG Components; } PARSER_GRAMMAR_ELEMENT, *PPARSER_GRAMMAR_ELEMENT; /*++ Structure Description: This structure stores a parsed node. Members: GrammarElement - Stores the type of grammar element this node represents. GrammarIndex - Stores the index of the rule that applied for this grammar node. StartToken - Stores a pointer to the token where parsing of this node began. Tokens - Stores a pointer to the tokens in the node. Nodes - Stores a pointer to the child nodes in the node. In the free list, the first element stores the pointer to the next element in the free list. TokenCount - Stores the number of valid tokens in the token array. NodeCount - Stores the number of vaild nodes in the node array. TokenCapacity - Stores the maximum number of tokens the array can store before it must be reallocated. NodeCapacity - Stores the maximum number of nodes the array can store before it must be reallocated. --*/ struct _PARSER_NODE { ULONG GrammarElement; ULONG GrammarIndex; PLEXER_TOKEN StartToken; PLEXER_TOKEN *Tokens; PPARSER_NODE *Nodes; ULONG TokenCount; ULONG NodeCount; ULONG TokenCapacity; ULONG NodeCapacity; }; /*++ Structure Description: This structure stores the state for the parser. To initialize this, zero it out and call the initialize function. Members: Flags - Stores a bitfield of flags governing the lexer behavior. See YY_LEX_FLAG_* definitions. Context - Stores a context pointer that is passed to the get token function. Allocate - Stores a pointer to a function used allocate memory. Free - Stores a pointer to a function used to free memory. GetToken - Stores a pointer to a function used to get the next lexical token. NodeCallback - Stores an optional pointer to a function called when nodes are created or destroyed. Note that this callback needs to be prepared to create and destroy nodes potentially multiple times, as recursive descent parsers explore paths that may ultimately not be correct. Use of this callback is not recommended unless required (for languages like C where the parser feeds back into the lexer). Grammar - Stores a pointer to the grammar, which is defined as an array of grammar elements. GrammarBase - Stores the start of the range of component values that specify grammar elements themselves. GrammarEnd - Stores the end index of grammar elements, exclusive. Every rule component outside the range of grammar base to grammar size is assumed to be a lexer token. GrammarStart - Stores the starting element to parse. MaxRecursion - Stores the maximum allowed recursion depth. Supply 0 to allow infinite recursion. Lexer - Stores an optional pointer to the lexer, which can be used to print token names during debug. TokenArrays - Stores a pointer to an array of ever-doubling arrays of lexer tokens. TokenCount - Stores the total number of tokens stored in the token arrays. TokenCapacity - Stores the total number of tokens that can fit in the arrays before they will need to be resized. NextTokenIndex - Stores the next token index to process. NextToken - Stores a pointer to the next token, for fast access. FreeNodes - Stores a pointer to the singly linked list of free nodes. RecursionDepth - Stores the current recursion depth. --*/ typedef struct _PARSER { ULONG Flags; PVOID Context; PYY_ALLOCATE Allocate; PYY_FREE Free; PYY_GET_TOKEN GetToken; PYY_NODE_CALLBACK NodeCallback; PARSER_GRAMMAR_ELEMENT *Grammar; ULONG GrammarBase; ULONG GrammarEnd; ULONG GrammarStart; ULONG MaxRecursion; PLEXER Lexer; PLEXER_TOKEN *TokenArrays; ULONG TokenCount; ULONG TokenCapacity; ULONG NextTokenIndex; PLEXER_TOKEN NextToken; PPARSER_NODE FreeNodes; ULONG RecursionDepth; } PARSER, *PPARSER; // // -------------------------------------------------------------------- Globals // // // -------------------------------------------------------- Function Prototypes // // // LALR parser functions. // YY_API YY_STATUS YyParseGrammar ( PYY_PARSER Parser ); /*++ Routine Description: This routine parses input according to an LALR(1) compiled grammar. Arguments: Parser - Supplies a pointer to the initialized parser context. Return Value: 0 on success. Returns a non-zero value if the parsing failed, the lexer failed, or the callback failed. --*/ // // Lexer functions // YY_API KSTATUS YyLexInitialize ( PLEXER Lexer ); /*++ Routine Description: This routine initializes a lexer. Arguments: Lexer - Supplies a pointer to the lexer to initialize. Return Value: STATUS_SUCCESS on success. STATUS_INVALID_PARAMETER if the required fields of the lexer are not filled in. --*/ YY_API KSTATUS YyLexGetToken ( PLEXER Lexer, PLEXER_TOKEN Token ); /*++ Routine Description: This routine gets the next token from the lexer. Arguments: Lexer - Supplies a pointer to the lexer. Token - Supplies a pointer where the next token will be filled out and returned. Return Value: STATUS_SUCCESS on success. STATUS_END_OF_FILE if the file end was reached. STATUS_MALFORMED_DATA_STREAM if the given input matched no rule in the lexer and the lexer was not configured to ignore such things. --*/ // // Parser functions // YY_API KSTATUS YyParserInitialize ( PPARSER Parser ); /*++ Routine Description: This routine initializes a parser. This routine assumes the parser was initially zeroed out, and is currently being initialized or reset after use. Arguments: Parser - Supplies a pointer to the parser to initialize. Return Value: STATUS_SUCCESS on success. STATUS_INVALID_PARAMETER if the required fields of the lexer are not filled in. --*/ YY_API VOID YyParserReset ( PPARSER Parser ); /*++ Routine Description: This routine resets a parser, causing it to return to its initial state but not forget the tokens it has seen. Arguments: Parser - Supplies a pointer to the parser to reset. Return Value: None. --*/ YY_API VOID YyParserDestroy ( PPARSER Parser ); /*++ Routine Description: This routine frees all the resources associated with a given parser. Arguments: Parser - Supplies a pointer to the parser. Return Value: None. --*/ YY_API KSTATUS YyParse ( PPARSER Parser, PPARSER_NODE *Tree ); /*++ Routine Description: This routine attempts to parse input grammatically based on a set of grammar rules and lexer input tokens. It will build an abstract syntax tree, which will be returned as a node. Arguments: Parser - Supplies a pointer to the parser. The caller must have filled in the grammar rules and support function pointers. Tree - Supplies a pointer where the abstract syntax tree will be returned. Return Value: STATUS_SUCCESS on success. STATUS_INSUFFICIENT_RESOURCES on allocation failure. STATUS_INVALID_SEQUENCE upon parse failure. Other errors on other failures (such as lexer errors). --*/ YY_API VOID YyDestroyNode ( PPARSER Parser, PPARSER_NODE Node ); /*++ Routine Description: This routine destroys a parser node. Arguments: Parser - Supplies a pointer to the parser. Node - Supplies a pointer to the node to destroy. Return Value: None. --*/