Files
chess-games/pgn-extract/lex.c
2024-01-22 07:30:05 +01:00

1683 lines
52 KiB
C

/*
* This file is part of pgn-extract: a Portable Game Notation (PGN) extractor.
* Copyright (C) 1994-2022 David J. Barnes
*
* pgn-extract is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* pgn-extract is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with pgn-extract. If not, see <http://www.gnu.org/licenses/>.
*
* David J. Barnes may be contacted as d.j.barnes@kent.ac.uk
* https://www.cs.kent.ac.uk/people/staff/djb/
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#if defined(__BORLANDC__) || defined(_MSC_VER)
#include <io.h>
#ifndef R_OK
#define R_OK 0
#endif
#else
#include <unistd.h>
#endif
#include "bool.h"
#include "mymalloc.h"
#include "defs.h"
#include "typedef.h"
#include "tokens.h"
#include "taglist.h"
#include "lex.h"
#include "moves.h"
#include "lists.h"
#include "decode.h"
#include "lines.h"
#include "grammar.h"
#include "apply.h"
#include "output.h"
/* Prototypes for the functions in this file. */
static Boolean extract_yytext(const unsigned char *symbol_start,
const unsigned char *linep);
static int identify_tag(const char *tag_string);
static TagName make_new_tag(const char *tag);
static Boolean open_input(const char *infile);
static Boolean open_input_file(int file_number);
/* When a move is saved, what is known of its source and destination coordinates
* should also be saved.
*/
static void save_k_castle(void);
static void save_move(const unsigned char *move);
static void save_q_castle(void);
static void save_string(const char *result);
static void terminate_input(void);
static unsigned long line_number = 0;
/* Keep track of the Recursive Annotation Variation level. */
static unsigned RAV_level = 0;
/* Keep track of the last move found. */
static unsigned char last_move[MAX_MOVE_LEN + 1];
/* How many games we have extracted from this file. */
static unsigned games_in_file = 0;
/* Provide an input file pointer.
* This is initialised in init_lex_tables.
*/
static FILE *yyin = NULL;
/* Define space for holding matched tokens. */
#define MAX_YYTEXT 100
static unsigned char yytext[MAX_YYTEXT + 1];
YYSTYPE yylval;
#define MAX_CHAR 256
#define ALPHA_DIST ('a'-'A')
/* Table of symbol classifications. */
static TokenType ChTab[MAX_CHAR];
/* A boolean array as to whether a character is allowed in a move or not. */
static short MoveChars[MAX_CHAR];
/* Define a table to hold the list of tag strings.
* This is initialised in init_list_of_known_tags().
* As new tags are encountered, the list is expanded,
* and tag_list_length increased.
*/
static const char **TagList;
static unsigned tag_list_length = 0;
/* Which tags, if any, are to be suppressed in the output.
* The indices are the same as for TagList.
*/
static Boolean *suppressed_tags;
/* Nested comment depth: GlobalState.allow_nested_comments. */
static unsigned comment_depth = 0;
/* Initialise the TagList. This should be stored in alphabetical order,
* by virtue of the order in which the _TAG values are defined.
*/
static void
init_list_of_known_tags(void)
{
unsigned i;
tag_list_length = ORIGINAL_NUMBER_OF_TAGS;
TagList = (const char **) malloc_or_die(tag_list_length * sizeof (*TagList));
/* FALSE by default. */
suppressed_tags = (Boolean *) malloc_or_die(tag_list_length * sizeof(*suppressed_tags));
/* Be paranoid and put a string in every entry. */
for (i = 0; i < tag_list_length; i++) {
TagList[i] = "";
suppressed_tags[i] = FALSE;
}
TagList[ANNOTATOR_TAG] = "Annotator";
TagList[BLACK_TAG] = "Black";
TagList[BLACK_ELO_TAG] = "BlackElo";
TagList[BLACK_NA_TAG] = "BlackNA";
TagList[BLACK_TITLE_TAG] = "BlackTitle";
TagList[BLACK_TYPE_TAG] = "BlackType";
TagList[BLACK_USCF_TAG] = "BlackUSCF";
TagList[BOARD_TAG] = "Board";
TagList[DATE_TAG] = "Date";
TagList[ECO_TAG] = "ECO";
TagList[PSEUDO_ELO_TAG] = "Elo";
TagList[EVENT_TAG] = "Event";
TagList[EVENT_DATE_TAG] = "EventDate";
TagList[EVENT_SPONSOR_TAG] = "EventSponsor";
TagList[FEN_TAG] = "FEN";
TagList[PSEUDO_FEN_PATTERN_TAG] = "FENPattern";
TagList[PSEUDO_FEN_PATTERN_I_TAG] = "FENPatternI";
TagList[HASHCODE_TAG] = "HashCode";
TagList[LONG_ECO_TAG] = "LongECO";
TagList[MATCHLABEL_TAG] = "MatchLabel";
TagList[MATERIAL_MATCH_TAG] = "MaterialMatch";
TagList[MODE_TAG] = "Mode";
TagList[NIC_TAG] = "NIC";
TagList[OPENING_TAG] = "Opening";
TagList[PSEUDO_PLAYER_TAG] = "Player";
TagList[PLY_COUNT_TAG] = "PlyCount";
TagList[RESULT_TAG] = "Result";
TagList[ROUND_TAG] = "Round";
TagList[SECTION_TAG] = "Section";
TagList[SETUP_TAG] = "SetUp";
TagList[SITE_TAG] = "Site";
TagList[STAGE_TAG] = "Stage";
TagList[SUB_VARIATION_TAG] = "SubVariation";
TagList[TERMINATION_TAG] = "Termination";
TagList[TIME_TAG] = "Time";
TagList[TIME_CONTROL_TAG] = "TimeControl";
TagList[TOTAL_PLY_COUNT_TAG] = "TotalPlyCount";
TagList[UTC_DATE_TAG] = "UTCDate";
TagList[UTC_TIME_TAG] = "UTCTime";
TagList[VARIANT_TAG] = "Variant";
TagList[VARIATION_TAG] = "Variation";
TagList[WHITE_TAG] = "White";
TagList[WHITE_ELO_TAG] = "WhiteElo";
TagList[WHITE_NA_TAG] = "WhiteNA";
TagList[WHITE_TITLE_TAG] = "WhiteTitle";
TagList[WHITE_TYPE_TAG] = "WhiteType";
TagList[WHITE_USCF_TAG] = "WhiteUSCF";
}
/* Append a new tag name to TagList.
 * TagList and suppressed_tags each grow by one entry; the new TagList
 * entry is a copy of tag and the new tag is not suppressed.
 * The game header is told about the new length so that its own tags
 * array can hold the new tag.
 * Returns the index of the new entry, which is the value
 * tag_list_length had on entry.
 */
static TagName
make_new_tag(const char *tag)
{
    const unsigned new_slot = tag_list_length;
    const unsigned new_length = new_slot + 1;

    tag_list_length = new_length;
    TagList = (const char **) realloc_or_die((void *) TagList,
                                             new_length * sizeof (*TagList));
    suppressed_tags = (Boolean *) realloc_or_die((void *) suppressed_tags,
                                                 new_length * sizeof(*suppressed_tags));

    TagList[new_slot] = copy_string(tag);
    suppressed_tags[new_slot] = FALSE;

    /* Keep the game header's tags array in step with TagList. */
    increase_game_header_tags_length(new_length);
    return new_slot;
}
/* Return the name held in TagList for tag.
 * An index outside the current TagList is treated as an internal
 * error: it is reported to the log file and the program exits.
 */
const char *
tag_header_string(TagName tag)
{
    if (!(tag < tag_list_length)) {
        fprintf(GlobalState.logfile, "Internal error in tag_header_string(%d)\n",
                tag);
        exit(1);
        /* Not reached. */
        return NULL;
    }
    return TagList[tag];
}
/* Return whether tag has been marked as suppressed on output.
 * An index outside the current tag list is treated as an internal
 * error: it is reported to the log file and the program exits.
 */
Boolean
is_suppressed_tag(TagName tag)
{
    if (!(tag < tag_list_length)) {
        fprintf(GlobalState.logfile, "Internal error in is_suppressed_tag(%d)\n",
                tag);
        exit(1);
        /* Not reached. */
        return FALSE;
    }
    return suppressed_tags[tag];
}
/* Mark the tag named tag_string as one that is not to be output.
 * A name that is not yet in TagList is added to it first.
 */
void
suppress_tag(const char *tag_string)
{
    const int found = identify_tag(tag_string);
    const int slot = (found >= 0) ? found : (int) make_new_tag(tag_string);

    suppressed_tags[slot] = TRUE;
}
/* Initialise ChTab[], the classification of the initial characters
* of symbols.
* Initialise MoveChars, the classification of secondary characters
* of moves.
*/
void
init_lex_tables(void)
{
int i;
/* Assume standard input will be used, until we know otherwise. */
yyin = stdin;
init_list_of_known_tags();
/* Initialise ChTab[]. */
for (i = 0; i < MAX_CHAR; i++) {
ChTab[i] = ERROR_TOKEN;
}
ChTab[' '] = WHITESPACE;
ChTab['\t'] = WHITESPACE;
ChTab['\r'] = WHITESPACE;
ChTab['['] = TAG_START;
ChTab[']'] = TAG_END;
ChTab['"'] = DOUBLE_QUOTE;
ChTab['{'] = COMMENT_START;
ChTab['}'] = COMMENT_END;
ChTab['$'] = NAG;
ChTab['!'] = ANNOTATE;
ChTab['?'] = ANNOTATE;
ChTab['+'] = CHECK_SYMBOL;
ChTab['#'] = CHECK_SYMBOL;
ChTab['.'] = DOT;
ChTab['('] = RAV_START;
ChTab[')'] = RAV_END;
ChTab['%'] = PERCENT;
ChTab[';'] = SEMICOLON;
ChTab['\\'] = ESCAPE;
ChTab['\0'] = EOS;
ChTab['*'] = STAR;
ChTab['-'] = DASH;
ChTab['/'] = SLASH;
/* Operators allowed only in the tag file. */
ChTab['<'] = OPERATOR;
ChTab['>'] = OPERATOR;
ChTab['='] = OPERATOR; /* Overloaded in MoveChars. */
for (i = '0'; i <= '9'; i++) {
ChTab[i] = DIGIT;
}
for (i = 'A'; i <= 'Z'; i++) {
ChTab[i] = ALPHA;
ChTab[i + ALPHA_DIST] = ALPHA;
}
ChTab['_'] = ALPHA;
/* Classify the Russian piece letters as ALPHA. */
ChTab[RUSSIAN_KNIGHT_OR_KING] = ALPHA; /* King and Knight. */
ChTab[RUSSIAN_KING_SECOND_LETTER] = ALPHA; /* King (second character). */
ChTab[RUSSIAN_QUEEN] = ALPHA; /* Queen. */
ChTab[RUSSIAN_ROOK] = ALPHA; /* Rook. */
ChTab[RUSSIAN_BISHOP] = ALPHA; /* Bishop. */
/* Initialise MoveChars[]. */
for (i = 0; i < MAX_CHAR; i++) {
MoveChars[i] = 0;
}
/* Files. */
for (i = 'a'; i <= 'h'; i++) {
MoveChars[i] = 1;
}
/* Ranks. */
for (i = '1'; i <= '8'; i++) {
MoveChars[i] = 1;
}
/* Upper-case pieces. */
MoveChars['K'] = 1;
MoveChars['Q'] = 1;
MoveChars['R'] = 1;
MoveChars['N'] = 1;
MoveChars['B'] = 1;
/* Lower-case pieces. */
MoveChars['k'] = 1;
MoveChars['q'] = 1;
MoveChars['r'] = 1;
MoveChars['n'] = 1;
MoveChars['b'] = 1;
/* Other u-c Dutch/German characters. */
MoveChars['D'] = 1; /* Queen. */
MoveChars['T'] = 1; /* Rook. */
MoveChars['S'] = 1; /* Knight. */
MoveChars['P'] = 1; /* Knight. */
MoveChars['L'] = 1; /* Bishop. */
/* Russian characters. */
MoveChars[RUSSIAN_KNIGHT_OR_KING] = 1; /* King and Knight. */
MoveChars[RUSSIAN_KING_SECOND_LETTER] = 1; /* King (second character). */
MoveChars[RUSSIAN_QUEEN] = 1; /* Queen. */
MoveChars[RUSSIAN_ROOK] = 1; /* Rook. */
MoveChars[RUSSIAN_BISHOP] = 1; /* Bishop. */
/* Capture and square separators. */
MoveChars['x'] = 1;
MoveChars['X'] = 1;
MoveChars[':'] = 1;
MoveChars['-'] = 1;
/* Promotion character. */
MoveChars['='] = 1;
/* Castling. */
MoveChars['O'] = 1;
MoveChars['o'] = 1;
MoveChars['0'] = 1;
/* Allow a trailing p for ep. */
MoveChars['p'] = 1;
}
/* Starting from linep in line, gather up the string until
 * the closing quote. Skip over the closing quote.
 * A backslash escapes the character that follows it; both characters
 * are kept in the gathered text.
 * The text is stored in newly allocated memory in yylval.token_string
 * and the returned LinePair always has the token STRING.
 * NB: This token is only used for tags, which are notoriously
 * error prone, so there is some code attempting recovery
 * if requested via GlobalState.fix_tag_strings.
 * If the line ends before a closing quote is found, this is reported
 * and linep is left on the line's terminating '\0'.
 */
LinePair
gather_string(char *line, unsigned char *linep)
{
    LinePair resulting_line;
    char ch;
    unsigned len = 0;
    char *str;
    Boolean end_of_string = FALSE;
    /* TRUE when the scan below consumed the line's terminating '\0',
     * which leaves linep one past the end of the line.
     */
    Boolean consumed_terminator;

    do {
        ch = *linep++;
        len++;
        if (ch == '\\') {
            /* Escape the next character. */
            ch = *linep++;
            len++;
            if(ch == '\0') {
                fprintf(GlobalState.logfile, "Missing escaped character in string.\n");
                print_error_context(GlobalState.logfile);
                end_of_string = TRUE;
            }
        }
        else if(ch == '"' || ch == '\0') {
            end_of_string = TRUE;
        }
        else {
            /* Ordinary character. */
        }
    } while (!end_of_string);
    consumed_terminator = (ch == '\0') ? TRUE : FALSE;

    if(GlobalState.fix_tag_strings && ch == '"') {
        /* Look for potentially badly formatted tag strings.
         * Don't assume that the second double-quote character
         * is the termination point.
         */
        unsigned char *lookahead = linep;
        Boolean malformed = FALSE;
        /* Anything other than white space between the quote and the
         * end of the tag (or line) means the quote was not the end.
         */
        while(*lookahead != '\0' && ChTab[*lookahead] != TAG_END) {
            TokenType tt = ChTab[*lookahead];
            if(tt != WHITESPACE) {
                malformed = TRUE;
            }
            lookahead++;
        }
        if(malformed) {
            fprintf(GlobalState.logfile, "Malformed tag string.\n");
            print_error_context(GlobalState.logfile);
            /* Step back over trailing white space to the last
             * significant character before the end of the tag.
             */
            lookahead--;
            while(lookahead > linep && ChTab[*lookahead] == WHITESPACE) {
                lookahead--;
            }
            if(*lookahead == '"') {
                /* Likely intended end of string. */
                ch = *lookahead;
                len += lookahead - linep;
                linep = lookahead + 1;
            }
            else {
                /* The closing quote appears to be missing. */
                /* NOTE(review): here linep is left on lookahead rather
                 * than one past it, so the copy below appears to start
                 * at the opening quote and to stop one character short
                 * of the text - confirm before relying on this branch.
                 */
                lookahead++;
                ch = *lookahead;
                len += lookahead - linep;
                linep = lookahead;
            }
            /* Replace any previous closing double quotes with single quotes. */
            str = (char *) malloc_or_die(len + 1);
            unsigned char *p = linep - len - 1;
            int i = 0;
            while(p < linep - 1) {
                if(*p == '"') {
                    str[i++] = '\'';
                    p++;
                }
                else if(*p == '\\') {
                    /* Keep an escape together with the character it escapes. */
                    str[i++] = *p++;
                    str[i++] = *p++;
                }
                else {
                    str[i++] = *p++;
                }
            }
            str[i] = '\0';
        }
        else {
            /* The last one doesn't belong in the string. */
            len--;
            str = (char *) malloc_or_die(len + 1);
            strncpy(str, (const char *) (linep - len - 1), len);
            str[len] = '\0';
        }
    }
    else {
        /* The last one doesn't belong in the string. */
        len--;
        /* Allocate space for the result. */
        str = (char *) malloc_or_die(len + 1);
        strncpy(str, (const char *) (linep - len - 1), len);
        str[len] = '\0';
    }
    /* Store it in yylval. */
    yylval.token_string = str;
    /* Make sure that the string was properly terminated, by
     * looking at the last character examined.
     */
    if (ch == '\0') {
        /* Too far. */
        if (!GlobalState.skipping_current_game) {
            fprintf(GlobalState.logfile, "Missing closing quote in %s\n", line);
        }
        if (len > 1) {
            /* Move back to the null. */
            linep--;
            str[len - 1] = '\0';
        }
        else if (consumed_terminator) {
            /* There is nothing to trim from so short a string, but the
             * scan still stepped over the terminating null. Move back
             * to it so that the caller does not read beyond the line.
             */
            linep--;
        }
        else {
            /* linep is already on the null. */
        }
    }
    else {
        /* We have already skipped over the closing quote. */
    }
    resulting_line.line = line;
    resulting_line.linep = linep;
    resulting_line.token = STRING;
    return resulting_line;
}
/*
 * Report whether ChTab classifies ch as character_class.
 * This gives code outside this file access to ChTab.
 */
Boolean
is_character_class(unsigned char ch, TokenType character_class)
{
    const TokenType actual_class = ChTab[ch];

    return actual_class == character_class;
}
/* Starting from linep in line, gather up a comment until its closing
 * '}' and skip over that character. A comment may run over several
 * input lines; further lines are read from yyin as needed.
 * When GlobalState.allow_nested_comments is set, a '}' only ends the
 * comment once every nested '{' has been closed.
 * When GlobalState.keep_comments is set, each line's portion of the
 * comment is saved, with leading and trailing spaces removed.
 * yylval.comment is set to a newly allocated CommentList and the
 * returned LinePair always has the token COMMENT.
 */
static LinePair
gather_comment(char *line, unsigned char *linep)
{
    LinePair outcome;
    /* The pieces of the comment gathered so far, one per line. */
    StringList *pieces = NULL;
    CommentList *comment;
    char ch;

    /* Account for the '{' that the caller has already consumed. */
    comment_depth++;
    do {
        /* Each pass deals with the part of the comment on one line. */
        unsigned len = 0;

        for (;;) {
            ch = *linep++;
            len++;
            if (GlobalState.allow_nested_comments) {
                if (ch == '{') {
                    comment_depth++;
                }
                else if (ch == '}' && comment_depth > 1) {
                    comment_depth--;
                    /* Prevent this terminating the outer level. */
                    ch = ' ';
                }
            }
            if (ch == '}' || ch == '\0') {
                break;
            }
        }
        if (ch == '}') {
            comment_depth--;
        }
        /* The last character doesn't belong in the comment. */
        len--;

        if (GlobalState.keep_comments) {
            const unsigned char *text = linep - len - 1;
            /* [first, last) is the text once the spaces are trimmed. */
            int last = (int) len;
            int first = 0;
            char *piece;

            while (last > 0 && text[last - 1] == ' ') {
                last--;
            }
            while (first < last && text[first] == ' ') {
                first++;
            }
            piece = (char *) malloc_or_die(last - first + 1);
            memcpy(piece, text + first, (size_t) (last - first));
            piece[last - first] = '\0';
            pieces = save_string_list_item(pieces, piece);
        }

        if (ch == '\0') {
            /* The comment continues on the next line, if there is one. */
            line = next_input_line(yyin);
            linep = (unsigned char *) line;
        }
    } while ((ch != '}') && (line != NULL));

    if (comment_depth > 0) {
        fprintf(GlobalState.logfile, "Missing end of a nested comment.\n");
        report_details(GlobalState.logfile);
    }

    /* Set up the structure to be returned. */
    comment = (CommentList *) malloc_or_die(sizeof (*comment));
    comment->comment = pieces;
    comment->next = NULL;
    yylval.comment = comment;

    outcome.line = line;
    outcome.linep = linep;
    outcome.token = COMMENT;
    return outcome;
}
/* Treat everything from linep to the end of line as a comment and
 * move on to the next input line.
 * When GlobalState.keep_comments is set, the text is saved, with
 * leading and trailing spaces removed, in a newly allocated
 * CommentList stored in yylval.comment, and the token is COMMENT.
 * Otherwise the text is discarded and the token is NO_TOKEN.
 */
static LinePair
gather_single_line_comment(char *line, unsigned char *linep)
{
    LinePair outcome;

    if (!GlobalState.keep_comments) {
        outcome.token = NO_TOKEN;
    }
    else {
        const unsigned char *text = linep;
        /* [first, last) is the text once the spaces are trimmed. */
        int last = strlen((const char *) linep);
        int first = 0;
        StringList *pieces = NULL;
        CommentList *comment;
        char *saved;
        char *dest;
        int k;

        while (last > 0 && text[last - 1] == ' ') {
            last--;
        }
        while (first < last && text[first] == ' ') {
            first++;
        }

        saved = (char *) malloc_or_die(last - first + 1);
        /* NB: Single-line comments are currently converted to multi-line
         * comment format.
         * On the off-chance that one might contain a curly bracket, 'escape'
         * those characters by replacing with square brackets.
         */
        dest = saved;
        for (k = first; k < last; k++) {
            switch (text[k]) {
            case '{':
                *dest++ = '[';
                break;
            case '}':
                *dest++ = ']';
                break;
            default:
                *dest++ = (char) text[k];
                break;
            }
        }
        *dest = '\0';
        pieces = save_string_list_item(pieces, saved);

        /* Set up the comment structure to be returned. */
        comment = (CommentList *) malloc_or_die(sizeof (*comment));
        comment->comment = pieces;
        comment->next = NULL;
        yylval.comment = comment;
        outcome.token = COMMENT;
    }

    /* The comment used up the rest of the line either way. */
    outcome.line = next_input_line(yyin);
    outcome.linep = (unsigned char *) outcome.line;
    return outcome;
}
/* Does the text at text start with prefix? */
static int
rest_begins_with(const unsigned char *text, const char *prefix)
{
    return strncmp((const char *) text, prefix, strlen(prefix)) == 0;
}

/* Deal with a symbol that starts with a digit; linep is just past
 * initial_digit.
 * A leading 0 may start the result 0-1 or castling written with zeros.
 * A leading 1 may start the result 1-0 or a draw, 1/2 or 1/2-1/2.
 * Anything else is a move number, whose value is stored in
 * yylval.move_number and whose trailing dots are skipped.
 * The token is TERMINATING_RESULT, MOVE or MOVE_NUMBER accordingly,
 * or NO_TOKEN if a move number is too long for yytext.
 */
static LinePair
gather_possible_numeric(char *line, unsigned char *linep, char initial_digit)
{
    LinePair outcome;
    /* Keep a record of where this token started. */
    const unsigned char *symbol_start = linep - 1;
    TokenType token = MOVE_NUMBER;

    switch (initial_digit) {
    case '0':
        /* Could be castling or a result. */
        if (rest_begins_with(linep, "-1")) {
            save_string("0-1");
            token = TERMINATING_RESULT;
            linep += 2;
        }
        else if (rest_begins_with(linep, "-0-0")) {
            save_q_castle();
            token = MOVE;
            linep += 4;
        }
        else if (rest_begins_with(linep, "-0")) {
            save_k_castle();
            token = MOVE;
            linep += 2;
        }
        break;
    case '1':
        if (rest_begins_with(linep, "-0")) {
            save_string("1-0");
            token = TERMINATING_RESULT;
            linep += 2;
        }
        else if (rest_begins_with(linep, "/2")) {
            linep += 2;
            /* Check for the full form. */
            if (rest_begins_with(linep, "-1/2")) {
                linep += 4;
            }
            /* Make sure that the full form of the draw result
             * is saved.
             */
            save_string("1/2-1/2");
            token = TERMINATING_RESULT;
        }
        break;
    default:
        break;
    }

    /* TERMINATING_RESULT and MOVE have already been dealt with. */
    if (token == MOVE_NUMBER) {
        /* Gather the remaining digits. */
        while (isdigit((unsigned) *linep)) {
            linep++;
        }
        /* Fill out the fields of yylval. */
        if (extract_yytext(symbol_start, linep)) {
            yylval.move_number = 0;
            (void) sscanf((const char *) yytext, "%u", &yylval.move_number);
            /* Skip any trailing dots. */
            while (*linep == '.') {
                linep++;
            }
        }
        else {
            token = NO_TOKEN;
        }
    }

    outcome.line = line;
    outcome.linep = linep;
    outcome.token = token;
    return outcome;
}
/* Search TagList[] for tag_string.
 * Returns the index at which it is stored, which is its _TAG value,
 * or -1 if it is not present.
 * The search is linear because tags met in the source files are
 * appended to the list without sorting, so the list cannot be
 * relied upon to be in order.
 */
static int
identify_tag(const char *tag_string)
{
    int found = -1;
    unsigned slot = 0;

    while (found < 0 && slot < tag_list_length) {
        if (strcmp(tag_string, TagList[slot]) == 0) {
            found = (int) slot;
        }
        else {
            slot++;
        }
    }
    return found;
}
/* Starting from linep in line, gather up the tag name.
 * Skip over any preceding white space.
 * A tag name is a run of letters, digits and underscores. A name
 * that is not already in TagList is added to it.
 * On success the token is TAG and yylval.tag_index is the tag's
 * index; if no name is found, or the input is exhausted, the token
 * is NO_TOKEN.
 */
LinePair
gather_tag(char *line, unsigned char *linep)
{
    LinePair resulting_line;
    /* Held as unsigned char so that it is always a valid argument
     * for the <ctype.h> functions, whatever the byte's value.
     */
    unsigned char ch;
    unsigned len = 0;

    do {
        /* Check for end of line while skipping white space. */
        if (*linep == '\0') {
            line = next_input_line(yyin);
            linep = (unsigned char *) line;
        }
        if (line != NULL) {
            while (ChTab[(unsigned) *linep] == WHITESPACE) {
                linep++;
            }
        }
        /* NOTE(review): the test below compares a ChTab classification
         * with '\0' rather than testing the character itself, so it may
         * never cause a further line to be read - confirm the intent
         * against the TokenType values before changing it.
         */
    } while ((line != NULL) && (ChTab[(unsigned) *linep] == '\0'));

    if (line != NULL) {
        ch = *linep++;
        while (isalpha(ch) || isdigit(ch) || (ch == '_')) {
            len++;
            ch = *linep++;
        }
        /* The last one wasn't part of the tag. */
        linep--;
        if (len > 0) {
            int tag_item;
            char *tag_string;

            /* Allocate space for the result. */
            tag_string = (char *) malloc_or_die(len + 1);
            strncpy((char *) tag_string, (const char *) (linep - len), len);
            tag_string[len] = '\0';
            tag_item = identify_tag(tag_string);
            if (tag_item < 0) {
                tag_item = make_new_tag(tag_string);
            }
            if (tag_item >= 0 && ((unsigned) tag_item) < tag_list_length) {
                yylval.tag_index = tag_item;
                resulting_line.token = TAG;
                /* Only the index is needed from here on. */
                (void) free((void *) tag_string);
            }
            else {
                fprintf(GlobalState.logfile,
                        "Internal error: invalid tag index %d in gather_tag.\n",
                        tag_item);
                exit(1);
            }
        }
        else {
            resulting_line.token = NO_TOKEN;
        }
    }
    else {
        resulting_line.token = NO_TOKEN;
    }
    resulting_line.line = line;
    resulting_line.linep = linep;
    return resulting_line;
}
/* Copy the symbol that runs from symbol_start up to, but not
 * including, linep into yytext.
 * Returns TRUE if the whole symbol fitted. Otherwise yytext holds
 * the first MAX_YYTEXT characters, the problem is reported unless
 * the current game is being skipped, and FALSE is returned.
 */
static Boolean
extract_yytext(const unsigned char *symbol_start, const unsigned char *linep)
{
    const long symbol_len = linep - symbol_start;

    if (!(symbol_len < MAX_YYTEXT)) {
        /* Too long: keep as much as fits, for the error message. */
        strncpy((char *) yytext, (const char *) symbol_start, MAX_YYTEXT);
        yytext[MAX_YYTEXT] = '\0';
        if (!GlobalState.skipping_current_game) {
            fprintf(GlobalState.logfile, "Symbol %s exceeds length of %u.\n",
                    yytext, MAX_YYTEXT);
        }
        return FALSE;
    }

    strncpy((char *) yytext, (const char *) symbol_start, symbol_len);
    yytext[symbol_len] = '\0';
    return TRUE;
}
/* Identify the next symbol.
 * Don't take any action on EOF -- leave that to next_token.
 * The current input line and the position within it are remembered
 * between calls. The character at that position is classified with
 * ChTab and the rest of the symbol is gathered according to that
 * classification. Symbols that produce NO_TOKEN are passed over, so
 * the value returned is never NO_TOKEN.
 * Values that accompany a token are left in yylval.
 */
static TokenType
get_next_symbol(void)
{
    /* The line being scanned and the position reached within it. */
    static char *line = NULL;
    static unsigned char *linep = NULL;
    /* The token to be returned. */
    TokenType token;
    LinePair resulting_line;

    do {
        /* Remember where in line the current symbol starts. */
        const unsigned char *symbol_start;
        /* Clear any remaining symbol. */
        *yytext = '\0';
        if (line == NULL) {
            line = next_input_line(yyin);
            linep = (unsigned char *) line;
            if (line != NULL) {
                token = NO_TOKEN;
            }
            else {
                token = EOF_TOKEN;
            }
        }
        else {
            int next_char = *linep & 0x0ff;
            /* Remember where we start. */
            symbol_start = linep;
            linep++;
            token = ChTab[next_char];
            switch (token) {
            case WHITESPACE:
                while (ChTab[(unsigned) *linep] == WHITESPACE)
                    linep++;
                token = NO_TOKEN;
                break;
            case TAG_START:
                resulting_line = gather_tag(line, linep);
                /* Pick up where we are now. */
                line = resulting_line.line;
                linep = resulting_line.linep;
                token = resulting_line.token;
                break;
            case TAG_END:
                token = NO_TOKEN;
                break;
            case DOUBLE_QUOTE:
                resulting_line = gather_string(line, linep);
                /* Pick up where we are now. */
                line = resulting_line.line;
                linep = resulting_line.linep;
                token = resulting_line.token;
                break;
            case COMMENT_START:
                resulting_line = gather_comment(line, linep);
                /* Pick up where we are now. */
                line = resulting_line.line;
                linep = resulting_line.linep;
                token = resulting_line.token;
                break;
            case COMMENT_END:
                if (!GlobalState.skipping_current_game) {
                    fprintf(GlobalState.logfile, "Unmatched comment end on line %lu.\n", line_number);
                }
                token = NO_TOKEN;
                break;
            case NAG:
                /* A '$' followed by its digits; saved as it stands. */
                while (isdigit((unsigned) *linep)) {
                    linep++;
                }
                if (extract_yytext(symbol_start, linep)) {
                    save_string((const char *) yytext);
                }
                else {
                    token = NO_TOKEN;
                }
                break;
            case ANNOTATE:
                /* Don't return anything in case of error. */
                token = NO_TOKEN;
                while (ChTab[(unsigned) *linep] == ANNOTATE) {
                    linep++;
                }
                /* Convert the first two annotation characters to
                 * the corresponding NAG string.
                 */
                if (extract_yytext(symbol_start, linep)) {
                    switch (yytext[0]) {
                    case '!':
                        switch (yytext[1]) {
                        case '!':
                            save_string("$3");
                            break;
                        case '?':
                            save_string("$5");
                            break;
                        default:
                            save_string("$1");
                            break;
                        }
                        token = NAG;
                        break;
                    case '?':
                        switch (yytext[1]) {
                        case '!':
                            save_string("$6");
                            break;
                        case '?':
                            save_string("$4");
                            break;
                        default:
                            save_string("$2");
                            break;
                        }
                        token = NAG;
                        break;
                    }
                }
                break;
            case CHECK_SYMBOL:
                /* Allow ++ */
                while (ChTab[(unsigned) *linep] == CHECK_SYMBOL) {
                    linep++;
                }
                break;
            case DOT:
                while (ChTab[(unsigned) *linep] == DOT)
                    linep++;
                token = NO_TOKEN;
                break;
            case SEMICOLON:
                resulting_line = gather_single_line_comment(line, linep);
                /* Pick up where we are now. */
                line = resulting_line.line;
                linep = resulting_line.linep;
                token = resulting_line.token;
                break;
            case PERCENT:
                if(symbol_start == (const unsigned char *) line) {
                    /* Discard the rest of the line. */
                    line = next_input_line(yyin);
                    linep = (unsigned char *) line;
                    token = NO_TOKEN;
                }
                else {
                    /* Prior to v22-02 the position of % was not checked. */
                }
                break;
            case ESCAPE:
                /* @@@ What to do about this? */
                if (*linep != '\0') {
                    linep++;
                }
                token = NO_TOKEN;
                break;
            case ALPHA:
                /* Not all ALPHAs are move characters. */
                if (MoveChars[next_char]) {
                    /* Scan through the possible move characters. */
                    while (MoveChars[*linep & 0x0ff]) {
                        linep++;
                    }
                    if (extract_yytext(symbol_start, linep)) {
                        /* Only classify it as a move if it
                         * seems to be a complete move.
                         */
                        Boolean ok;
                        if (move_seems_valid(yytext)) {
                            save_move(yytext);
                            token = MOVE;
                            ok = TRUE;
                        }
                        else if(next_char == 'e') {
                            /* Consider for possible en passant notation. */
                            const int num_ep_strings = 2;
                            const char *ep[] = { "e.p.", "ep", };
                            int epi = 0;
                            while(epi < num_ep_strings &&
                                  strncmp((const char *) symbol_start, ep[epi], strlen(ep[epi])) != 0) {
                                epi++;
                            }
                            if(epi < num_ep_strings) {
                                /* Accept. */
                                /* PGN has no representation for ep, so just accept without checking. */
                                ok = TRUE;
                                token = NO_TOKEN;
                                /* Resume just after the ep text. */
                                linep = ((unsigned char *) symbol_start) + strlen(ep[epi]);
                            }
                            else {
                                ok = FALSE;
                            }
                        }
                        else {
                            ok = FALSE;
                        }
                        if(! ok) {
                            if (!GlobalState.skipping_current_game) {
                                print_error_context(GlobalState.logfile);
                                fprintf(GlobalState.logfile,
                                        "Unknown move text %s.\n", yytext);
                            }
                            token = NO_TOKEN;
                        }
                    }
                    else {
                        token = NO_TOKEN;
                    }
                }
                else if (next_char == 'Z' && *linep == '0') {
                    /* Z0 is accepted as a way of writing a null move. */
                    linep++;
                    save_move((const unsigned char *) NULL_MOVE_STRING);
                    token = MOVE;
                }
                else {
                    if (!GlobalState.skipping_current_game) {
                        print_error_context(GlobalState.logfile);
                        fprintf(GlobalState.logfile,
                                "Unknown character %c (Hex: %x).\n",
                                next_char, next_char);
                        /* Show the line with a marker under the character. */
                        fprintf(GlobalState.logfile, "%s\n", line);
                        unsigned pos = linep - (unsigned char *) line - 1;
                        for(unsigned i = 0; i < pos; i++) {
                            fputc(' ', GlobalState.logfile);
                        }
                        fputc('^', GlobalState.logfile);
                        fputc('\n', GlobalState.logfile);
                    }
                    /* Skip any sequence of them. */
                    while (ChTab[(unsigned) *linep] == ERROR_TOKEN) {
                        linep++;
                    }
                }
                break;
            case DIGIT:
                /* Remember that 0 can start 0-1 and 0-0.
                 * Remember that 1 can start 1-0 and 1/2.
                 */
                resulting_line = gather_possible_numeric(
                        line, linep, next_char);
                /* Pick up where we are now. */
                line = resulting_line.line;
                linep = resulting_line.linep;
                token = resulting_line.token;
                break;
            case EOF_TOKEN:
                break;
            case RAV_START:
                RAV_level++;
                break;
            case RAV_END:
                if (RAV_level > 0) {
                    RAV_level--;
                }
                else {
                    if (!GlobalState.skipping_current_game) {
                        print_error_context(GlobalState.logfile);
                        fprintf(GlobalState.logfile, "Too many ')' found.\n");
                    }
                    token = NO_TOKEN;
                }
                break;
            case STAR:
                save_string("*");
                token = TERMINATING_RESULT;
                break;
            case DASH:
                /* A double dash is accepted as a null move. */
                if (ChTab[(unsigned) *linep] == DASH) {
                    linep++;
                    save_move((const unsigned char *) NULL_MOVE_STRING);
                    token = MOVE;
                }
                else {
                    fprintf(GlobalState.logfile, "Single '-' not allowed.\n");
                    print_error_context(GlobalState.logfile);
                    token = NO_TOKEN;
                }
                break;
            case SLASH:
                /* Possible /ep annotation. */
                if(linep[0] == 'e' && linep[1] == 'p') {
                    /* PGN has no representation for ep, so just accept without checking. */
                    linep += 2;
                    token = NO_TOKEN;
                }
                else {
                    token = NO_TOKEN;
                    if (!GlobalState.skipping_current_game) {
                        print_error_context(GlobalState.logfile);
                        /* Terminate the message like the other
                         * diagnostics, so that the next log entry
                         * starts on a line of its own.
                         */
                        fprintf(GlobalState.logfile,
                                "Single '/' not allowed.\n");
                    }
                }
                break;
            case EOS:
                /* End of the string. */
                line = next_input_line(yyin);
                linep = (unsigned char *) line;
                token = NO_TOKEN;
                break;
            case ERROR_TOKEN:
                if (!GlobalState.skipping_current_game) {
                    print_error_context(GlobalState.logfile);
                    fprintf(GlobalState.logfile,
                            "Unknown character %c (Hex: %x).\n",
                            next_char, next_char);
                }
                /* Skip any sequence of them. */
                while (ChTab[(unsigned) *linep] == ERROR_TOKEN) {
                    linep++;
                }
                break;
            case OPERATOR:
                print_error_context(GlobalState.logfile);
                fprintf(GlobalState.logfile,
                        "Operator in illegal context: %c.\n", *symbol_start);
                /* Skip any sequence of them. */
                while (ChTab[(unsigned) *linep] == OPERATOR)
                    linep++;
                token = NO_TOKEN;
                break;
            default:
                if (!GlobalState.skipping_current_game) {
                    print_error_context(GlobalState.logfile);
                    fprintf(GlobalState.logfile,
                            "Internal error: Missing case for %d on char %x.\n",
                            token, next_char);
                }
                token = NO_TOKEN;
                break;
            }
        }
    } while (token == NO_TOKEN);
    return token;
}
/* Return the next token from the input.
 * When the current input is exhausted, yywrap is consulted and, for
 * as long as it returns zero, reading continues.
 * yywrap is not called while the ECO file is being parsed.
 */
TokenType
next_token(void)
{
    TokenType token = get_next_symbol();

    for (;;) {
        if (token != EOF_TOKEN) {
            break;
        }
        /* Don't call yywrap if parsing the ECO file. */
        if (GlobalState.parsing_ECO_file) {
            break;
        }
        if (yywrap()) {
            break;
        }
        token = get_next_symbol();
    }
    return token;
}
/* Should token be passed over when looking for the start or end
 * of a game?
 * Returns FALSE for a terminating result, a tag, a move and the
 * end of the input, and TRUE for every other token.
 */
static Boolean
skip_token(TokenType token)
{
    if (token == TERMINATING_RESULT || token == TAG ||
            token == MOVE || token == EOF_TOKEN) {
        return FALSE;
    }
    return TRUE;
}
/* Skip tokens until the next game looks like it is
 * about to start. This is signalled by a tag section,
 * a terminating result from the previous game, or a move.
 * The end of the input also stops the skipping.
 * GlobalState.skipping_current_game is set while tokens are being
 * skipped, and the storage of any comment skipped is released.
 * Returns the token at which the skipping stopped, which is token
 * itself if that did not need to be skipped.
 */
TokenType
skip_to_next_game(TokenType token)
{
    if (skip_token(token)) {
        GlobalState.skipping_current_game = TRUE;
        do {
            if ((token == COMMENT) && (yylval.comment != NULL)) {
                /* Free the space.
                 * The comment structure exists even when it holds no
                 * text, as happens when comments are not being kept,
                 * so it is released whether or not there is a string
                 * list to release with it.
                 */
                if (yylval.comment->comment != NULL) {
                    free_string_list(yylval.comment->comment);
                }
                free((void *) yylval.comment);
                yylval.comment = NULL;
            }
            token = next_token();
        } while (skip_token(token));
        GlobalState.skipping_current_game = FALSE;
    }
    return token;
}
/* Save queen-side castling in its standard form, O-O-O, however
 * it was written in the input.
 */
static void
save_q_castle(void)
{
    static const unsigned char queen_side[] = "O-O-O";

    save_move(queen_side);
}
/* Save king-side castling in its standard form, O-O, however
 * it was written in the input.
 */
static void
save_k_castle(void)
{
    static const unsigned char king_side[] = "O-O";

    save_move(king_side);
}
/* Record the move just matched.
 * The decoded form of the move is stored in yylval.move_details and
 * its text is remembered in last_move.
 * A move that is too long for last_move is an internal error, which
 * is reported on stderr before the program exits.
 */
static void
save_move(const unsigned char *move)
{
    const char *move_text = (const char *) move;

    if (strlen(move_text) > MAX_MOVE_LEN) {
        fprintf(stderr, "Internal error: cannot handle %s (too long)\n", move);
        exit(1);
    }

    /* Decode the move into its components. */
    yylval.move_details = decode_move(move);
    /* Remember the last move. */
    strcpy((char *) last_move, move_text);
}
/* Reset the per-game state of the lexer: forget the last move and
 * return to the outermost variation level.
 */
void
restart_lex_for_new_game(void)
{
    RAV_level = 0;
    last_move[0] = '\0';
}
/* Make it possible to read multiple input files.
* These are held in list_of_files. The list
* is built up from the program's arguments.
*/
static int current_file_num = 0;
/* Keep track of the list of PGN files. These will either be the
* remaining arguments once flags have been dealt with, or
* those read from -c and -f arguments.
*/
static FILE_LIST list_of_files = {
(const char **) NULL,
(SourceFileType *) NULL,
0, 0
};
/* Return the value of current_file_num, the index within
 * list_of_files of the input file currently in use.
 */
unsigned
current_file_number(void)
{
    const unsigned file_number = current_file_num;

    return file_number;
}
/* Buffer I/O because it does seem to make a difference to the
 * processing speed of games.
 */
/* It doesn't appear to be necessary for this to be
 * particularly big to make a significant difference
 * to I/O efficiency.
 */
#define INPUT_BUFFER_LEN 500
/* Index of the next unread character in input_buffer. */
static size_t input_buffer_index = 0;
/* Number of characters currently held in input_buffer. */
static size_t input_buffer_limit = 0;
static char input_buffer[INPUT_BUFFER_LEN];

/* Fill the input buffer to its limit, if possible.
 * Once the end of fpin has been seen the buffer is left empty.
 * Reading restarts from the beginning of the buffer either way.
 */
static void fill_input_buffer(FILE *fpin)
{
    if(! feof(fpin)) {
        input_buffer_limit = fread(input_buffer, sizeof(*input_buffer), INPUT_BUFFER_LEN, fpin);
    }
    else {
        input_buffer_limit = 0;
    }
    input_buffer_index = 0;
}

/* Return the next input character, as an int to
 * support EOF.
 * The buffer is refilled from fpin when it has been used up.
 * As with getc, the character is returned as the value of an
 * unsigned char, so that no character read from the input can be
 * negative or be mistaken for EOF.
 */
static int get_next_char(FILE *fpin)
{
    if(input_buffer_index == input_buffer_limit) {
        fill_input_buffer(fpin);
    }
    if(input_buffer_index != input_buffer_limit) {
        return (unsigned char) input_buffer[input_buffer_index++];
    }
    else {
        return EOF;
    }
}
/* Unget the previous input character, by stepping back one place
 * in the input buffer. Nothing is stepped over for EOF.
 * It is an internal error to call this when the buffer is at its
 * start; that is reported and the program exits.
 */
static void unget_char(int c, FILE *fpin)
{
    if(input_buffer_index == 0) {
        fprintf(GlobalState.logfile, "Internal error: unget_char(%c)\n", c);
        report_details(GlobalState.logfile);
        exit(1);
    }
    if(c != EOF) {
        input_buffer_index--;
    }
}
/* Read a single line of input from fpin.
 * Returns the line in newly allocated memory, without its line
 * terminator, or NULL if there is no more input.
 * A line is ended by '\n', by '\r', by "\r\n" or by the end of the
 * input.
 */
#define INIT_LINE_LENGTH 100
#define LINE_INCREMENT 100
char *read_line(FILE *fpin)
{
    char *line;
    /* How many characters line can hold, excluding the terminator. */
    unsigned capacity = INIT_LINE_LENGTH;
    unsigned used = 0;
    int ch = get_next_char(fpin);

    if (ch == EOF) {
        return NULL;
    }

    line = (char *) malloc_or_die(INIT_LINE_LENGTH + 1);
    for (; (ch != '\n') && (ch != '\r') && (ch != EOF); ch = get_next_char(fpin)) {
        if (used == capacity) {
            /* Make room for some more characters. */
            line = (char *) realloc_or_die((void *) line,
                                           capacity + LINE_INCREMENT + 1);
            if (line == NULL) {
                return NULL;
            }
            capacity += LINE_INCREMENT;
        }
        line[used] = ch;
        used++;
    }
    line[used] = '\0';

    if (ch == '\r') {
        /* Try to avoid double counting lines in dos-format files:
         * take a '\n' that follows as part of the same line ending,
         * and put anything else back.
         */
        ch = get_next_char(fpin);
        if (ch != '\n' && ch != EOF) {
            unget_char(ch, fpin);
        }
    }
    return line;
}
/* Read a list of lines from fp. Each non-blank line is the name of
 * a file to be added, with the given file_type, to the existing
 * list_of_files. Blank lines are skipped.
 * add_filename_to_source_list() does the adding and keeps
 * list_of_files.files terminated with a (char *)NULL.
 */
void
add_filename_list_from_file(FILE *fp, SourceFileType file_type)
{
    if ((list_of_files.files == NULL) || (list_of_files.max_files == 0)) {
        /* Allocate an initial number of pointers for the lines.
         * This must always include an extra one for terminating NULL.
         */
        list_of_files.files = (const char **) malloc_or_die((INIT_LIST_SPACE + 1) *
                sizeof (const char *));
        list_of_files.file_type = (SourceFileType *) malloc_or_die((INIT_LIST_SPACE + 1) *
                sizeof (SourceFileType));
        list_of_files.max_files = INIT_LIST_SPACE;
        list_of_files.num_files = 0;
    }
    if (list_of_files.files != NULL) {
        char *line;

        for (line = read_line(fp); line != NULL; line = read_line(fp)) {
            if (non_blank_line(line)) {
                add_filename_to_source_list(line, file_type);
            }
            /* The line is no longer needed in either case:
             * add_filename_to_source_list() stores the result of
             * copy_string() rather than the pointer it is given,
             * so not freeing a non-blank line would leak it.
             */
            (void) free((void *) line);
        }
    }
}
/* Add filename, with the given file_type, to list_of_files.
 * The file must be readable: if it is not, a message is written to
 * the log file and the program exits.
 * The list's storage is created or enlarged as required. A copy of
 * filename is stored, so the caller keeps ownership of its string.
 * CHECKFILEs are kept ahead of all other files: a new CHECKFILE is
 * inserted after any CHECKFILEs already at the front of the list,
 * and any other type of file is appended.
 * The list of names is always left terminated by a NULL.
 */
void
add_filename_to_source_list(const char *filename, SourceFileType file_type)
{
    /* Where to put it. */
    unsigned location;

    if (access(filename, R_OK) != 0) {
        fprintf(GlobalState.logfile, "Unable to find %s\n", filename);
        exit(1);
    }

    /* See if there is room. */
    if (list_of_files.num_files == list_of_files.max_files) {
        /* There isn't, so increase the amount of available space,
         * ensuring that there is always an extra slot for the terminating
         * NULL.
         */
        if ((list_of_files.files == NULL) || (list_of_files.max_files == 0)) {
            /* Allocate an initial number of pointers for the lines.
             * This must always include an extra one for terminating NULL.
             */
            list_of_files.files = (const char **) malloc_or_die((INIT_LIST_SPACE + 1) *
                    sizeof (const char *));
            list_of_files.file_type = (SourceFileType *)
                    malloc_or_die((INIT_LIST_SPACE + 1) *
                    sizeof (SourceFileType));
            list_of_files.max_files = INIT_LIST_SPACE;
            list_of_files.num_files = 0;
        }
        else {
            list_of_files.files = (const char **) realloc_or_die((void *) list_of_files.files,
                    (list_of_files.max_files + MORE_LIST_SPACE + 1) *
                    sizeof (const char *));
            list_of_files.file_type = (SourceFileType *)
                    realloc_or_die((void *) list_of_files.file_type,
                    (list_of_files.max_files + MORE_LIST_SPACE + 1) *
                    sizeof (SourceFileType));
            /* Losing either array is fatal, so test with ||, and only
             * record the larger capacity once both are known to exist.
             */
            if ((list_of_files.files == NULL) || (list_of_files.file_type == NULL)) {
                perror("");
                abort();
            }
            list_of_files.max_files += MORE_LIST_SPACE;
        }
    }

    /* We know that there is space. By default the new file goes on
     * the end; this is read after the step above because that step
     * may reset num_files.
     */
    location = list_of_files.num_files;

    /* Ensure that CHECKFILEs are all stored before NORMALFILEs. */
    if (file_type == CHECKFILE) {
        for (location = 0; (location < list_of_files.num_files) &&
                (list_of_files.file_type[location] == CHECKFILE); location++) {
            /* Skip over the CHECKFILEs already at the front. */
        }
        if (location < list_of_files.num_files) {
            /* Put the new one here.
             * Move the rest down, working from the end so that
             * nothing is overwritten before it has been moved.
             */
            unsigned j;

            for (j = list_of_files.num_files; j > location; j--) {
                list_of_files.files[j] = list_of_files.files[j - 1];
                list_of_files.file_type[j] = list_of_files.file_type[j - 1];
            }
        }
    }
    list_of_files.files[location] = copy_string(filename);
    list_of_files.file_type[location] = file_type;
    list_of_files.num_files++;
    /* Keep the list properly terminated. */
    list_of_files.files[list_of_files.num_files] = (char *) NULL;
}
/* Make infile the current input source.
 * The file is opened in binary mode and yyin is set to the result,
 * whether or not the open succeeded. On success the name is
 * recorded in GlobalState.current_input_file and, when the
 * verbosity level is above 1, a "Processing" message is logged.
 * Returns whether the file could be opened.
 */
static Boolean
open_input(const char *infile)
{
    FILE *fp = fopen(infile, "rb");

    yyin = fp;
    if (fp != NULL) {
        GlobalState.current_input_file = infile;
        if (GlobalState.verbosity > 1) {
            fprintf(GlobalState.logfile, "Processing %s\n",
                    GlobalState.current_input_file);
        }
    }
    return fp != NULL;
}
/* Public entry point for opening the ECO file as the current input:
 * a thin wrapper around open_input().
 * Returns whether eco_file could be opened.
 */
Boolean
open_eco_file(const char *eco_file)
{
    const Boolean opened = open_input(eco_file);

    return opened;
}
/* Open the entry of list_of_files whose index is file_number.
 * On success, GlobalState.current_file_type is set to the type
 * recorded for that entry and TRUE is returned.
 * On failure, FALSE is returned and the file type is left alone.
 */
static Boolean
open_input_file(int file_number)
{
    if (!open_input(list_of_files.files[file_number])) {
        return FALSE;
    }
    GlobalState.current_file_type = list_of_files.file_type[file_number];
    return TRUE;
}
/* Open the first source of input.
 * With no files in list_of_files, standard input is used and is
 * reported under the name "stdin". Otherwise the first file in the
 * list is opened. Returns FALSE, after logging a message, only if
 * that first file could not be opened.
 */
Boolean
open_first_file(void)
{
    if (list_of_files.num_files == 0) {
        /* Nothing was listed, so read from standard input. */
        yyin = stdin;
        GlobalState.current_input_file = "stdin";
        /* @@@ GlobalState.current_file_type is not set on this
         * path; it is an open question whether it should be set
         * to NORMALFILE.
         */
        if (GlobalState.verbosity > 1) {
            fprintf(GlobalState.logfile, "Processing %s\n",
                    GlobalState.current_input_file);
        }
        return TRUE;
    }

    if (open_input_file(0)) {
        return TRUE;
    }

    fprintf(GlobalState.logfile,
            "Unable to open the PGN file: %s\n", input_file_name(0));
    return FALSE;
}
/* Look up the name stored in list_of_files at index file_number.
 * Returns NULL when file_number is not the index of a stored file.
 */
const char *
input_file_name(unsigned file_number)
{
    return (file_number < list_of_files.num_files)
               ? list_of_files.files[file_number]
               : NULL;
}
/* Write to fp a description of where in the input we currently are:
 * the name of the current input file, if one has been recorded,
 * followed by the current line number.
 */
void
print_error_context(FILE *fp)
{
    const char *source = GlobalState.current_input_file;

    if (source != NULL) {
        fprintf(fp, "File %s: ", source);
    }
    fprintf(fp, "Line number: %lu\n", line_number);
}
/* Store a freshly allocated copy of str in yylval.token_string.
 * The copy is not freed here.
 */
static void
save_string(const char *str)
{
    /* Size of the copy, including the terminating NUL. */
    const size_t nbytes = strlen(str) + 1;
    char *copy = (char *) malloc_or_die(nbytes);

    memcpy(copy, str, nbytes);
    yylval.token_string = copy;
}
/* Return the next line of input from fp, or NULL when there is none.
 * The returned line remains owned by this function: it is freed on
 * the following call, so callers must neither free it nor keep a
 * pointer to it across calls.
 * The line number is advanced for each line actually read.
 */
char *
next_input_line(FILE *fp)
{
    /* The line handed out by the previous call, if any. */
    static char *previous = NULL;

    /* free() accepts NULL, so no guard is needed on the first call. */
    free((void *) previous);
    previous = read_line(fp);
    if (previous != NULL) {
        line_number += 1;
    }
    return previous;
}
/* Handle the end of an input file by moving on to the next file in
 * list_of_files, if there is one.
 * Returns 0 if a further file was opened and reading should carry
 * on, or 1 if there is nothing more to read: there is no file list,
 * there was no current file, the last file has been processed, or
 * the next file could not be opened (which is also logged).
 */
int
yywrap(void)
{
    /* Beware of this being called in inappropriate circumstances:
     * either there are no files at all, or there was no last file.
     */
    if ((list_of_files.files == NULL) ||
            (input_file_name(current_file_num) == NULL)) {
        return 1;
    }

    /* Close the file just finished and step on to the next. */
    terminate_input();
    current_file_num++;

    if (input_file_name(current_file_num) == NULL) {
        /* We have processed the last file. */
        return 1;
    }
    if (!open_input_file(current_file_num)) {
        fprintf(GlobalState.logfile, "Unable to open the PGN file: %s\n",
                input_file_name(current_file_num));
        return 1;
    }

    /* Ok, we opened it. Set everything up for a new file, ensuring
     * that the current_file_type matches the type of this file.
     */
    GlobalState.current_file_type =
            list_of_files.file_type[current_file_num];
    restart_lex_for_new_game();
    games_in_file = 0;
    reset_line_number();
    return 0;
}
/* Report the value of the line counter for the current input. */
unsigned long
get_line_number(void)
{
    const unsigned long current_line = line_number;

    return current_line;
}
/* Set the line counter back to zero, ready for a new file. */
void
reset_line_number(void)
{
    line_number = 0UL;
}
/* Close the current input stream and clear yyin.
 * Nothing is done if there is no open stream or if the stream is
 * standard input, which is never closed here.
 */
static void
terminate_input(void)
{
    if ((yyin == NULL) || (yyin == stdin)) {
        return;
    }
    (void) fclose(yyin);
    yyin = NULL;
}