add pgn-extract 22.11
This commit is contained in:
604
pgn-extract/hashing.c
Normal file
604
pgn-extract/hashing.c
Normal file
@@ -0,0 +1,604 @@
|
||||
/*
|
||||
* This file is part of pgn-extract: a Portable Game Notation (PGN) extractor.
|
||||
* Copyright (C) 1994-2022 David J. Barnes
|
||||
*
|
||||
* pgn-extract is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* pgn-extract is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with pgn-extract. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* David J. Barnes may be contacted as d.j.barnes@kent.ac.uk
|
||||
* https://www.cs.kent.ac.uk/people/staff/djb/
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#if defined(__BORLANDC__) || defined(_MSC_VER)
|
||||
/* For unlink() */
|
||||
#include <io.h>
|
||||
#else
|
||||
/* For unlink() */
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "bool.h"
|
||||
#include "mymalloc.h"
|
||||
#include "defs.h"
|
||||
#include "typedef.h"
|
||||
#include "tokens.h"
|
||||
#include "taglist.h"
|
||||
#include "lex.h"
|
||||
#include "hashing.h"
|
||||
#include "zobrist.h"
|
||||
|
||||
/* Routines, similar in nature to those in apply.c
|
||||
* to implement a duplicate hash-table lookup using
|
||||
* an external file, rather than malloc'd memory.
|
||||
* The only limit should be a file system limit.
|
||||
* NB: Using an external, virtual file seems obsolete now.
|
||||
*
|
||||
* This version should be slightly more accurate than
|
||||
* the alternative because the final_ and cumulative_
|
||||
* hash values are both stored, rather than the XOR
|
||||
* of them.
|
||||
*/
|
||||
|
||||
/*
 * Name of the scratch file that backs the virtual (on-disk) duplicate
 * table.  It is opened with truncation when the table is initialised
 * and unlinked again when the table is cleared.
 * NOTE(review): the name is fixed and relative, so two runs sharing a
 * working directory would presumably clash -- confirm whether that matters.
 */
static char VIRTUAL_FILE[] = "virtual.tmp";

/*
 * Number of buckets in each duplicate-detection table
 * (both the in-memory one and the file-backed one).
 */
#define LOG_TABLE_SIZE 100003
|
||||
|
||||
/*
 * One bucket of the file-backed duplicate table.
 * Each bucket is a chain of VirtualHashLog records held in the
 * scratch file; only the chain's end points are kept in memory.
 */
typedef struct {
    /* File offset of the first record in the chain; -1 => empty bucket. */
    long head;
    /* File offset of the last record in the chain. */
    long tail;
} LogHeaderEntry;
|
||||
|
||||
/* If use_virtual_hash_table */
|
||||
static LogHeaderEntry *VirtualLogTable = NULL;
|
||||
|
||||
/* Define a table to hold hash values of the extracted games.
|
||||
* This is used to enable duplicate detection when not using
|
||||
* the virtual hash table.
|
||||
*/
|
||||
static HashLog **LogTable = NULL;
|
||||
|
||||
/* Define a type to hold hash values of interest.
|
||||
* This is used both to aid in duplicate detection
|
||||
* and in finding positional variations.
|
||||
*/
|
||||
typedef struct VirtualHashLog {
|
||||
/* Store the final position hash value and
|
||||
* the cumulative hash value for a game.
|
||||
*/
|
||||
HashCode final_hash_value, cumulative_hash_value;
|
||||
/* Record the file list index for the file this game was first found in. */
|
||||
int file_number;
|
||||
/* Record the file offset of the next element
|
||||
* in this list. -1 => end-of-list.
|
||||
*/
|
||||
long next;
|
||||
} VirtualHashLog;
|
||||
|
||||
static FILE *hash_file = NULL;
|
||||
|
||||
static const char *previous_virtual_occurance(Game game_details);
|
||||
|
||||
/*
|
||||
* Check whether the position counts indicate a desired repetition.
|
||||
* If we are checking for repetition return TRUE if it does and FALSE otherwise.
|
||||
* If we are not then return TRUE.
|
||||
*/
|
||||
Boolean check_for_only_repetition(PositionCount *position_counts)
|
||||
{
|
||||
if (GlobalState.check_for_repetition > 0) {
|
||||
PositionCount *entry = position_counts;
|
||||
while (entry != NULL && entry->count < GlobalState.check_for_repetition) {
|
||||
entry = entry->next;
|
||||
}
|
||||
return entry != NULL;
|
||||
}
|
||||
else {
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Encode the castling rights of the current board
|
||||
* as a 4-bit pattern.
|
||||
*/
|
||||
static unsigned short
|
||||
encode_castling_rights(const Board *board)
|
||||
{
|
||||
unsigned short rights = 0;
|
||||
if(board->WKingCastle) {
|
||||
rights |= 0x08;
|
||||
}
|
||||
if(board->WQueenCastle) {
|
||||
rights |= 0x04;
|
||||
}
|
||||
if(board->BKingCastle) {
|
||||
rights |= 0x02;
|
||||
}
|
||||
if(board->BQueenCastle) {
|
||||
rights |= 0x01;
|
||||
}
|
||||
return rights;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return TRUE if the position on board matches the entry details
|
||||
* for the purposes of position repetition matches:
|
||||
* + Same board position (based on a hash value)
|
||||
* + Same castling rights.
|
||||
* + Same en passant status (i.e., no ep possible).
|
||||
* + Same player to move.
|
||||
*/
|
||||
static Boolean
|
||||
position_matches(PositionCount *entry, const Board *board)
|
||||
{
|
||||
if(board->weak_hash_value != entry->hash_value) {
|
||||
return FALSE;
|
||||
}
|
||||
else if(board->to_move != entry->to_move) {
|
||||
return FALSE;
|
||||
}
|
||||
else if(encode_castling_rights(board) != entry->castling_rights) {
|
||||
return FALSE;
|
||||
}
|
||||
else {
|
||||
if(board->EnPassant) {
|
||||
return board->ep_rank == entry->ep_rank && board->ep_col == entry->ep_col;
|
||||
}
|
||||
else {
|
||||
return entry->ep_rank == '\0';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Add hash_value as a position in the current game.
|
||||
* Return the number of times this position has occurred.
|
||||
* NB: The assumption is that position_counts is not NULL.
|
||||
*/
|
||||
unsigned
|
||||
update_position_counts(PositionCount *position_counts, const Board *board)
|
||||
{
|
||||
//fprintf(stderr, "U: %d,%d\n", board->ep_rank, board->ep_col);
|
||||
PositionCount *entry = position_counts;
|
||||
if (position_counts == NULL) {
|
||||
/* Don't try to match in variations. */
|
||||
return 0;
|
||||
}
|
||||
/* Try to find an existing entry. */
|
||||
while (entry != NULL && !position_matches(entry, board)) {
|
||||
entry = entry->next;
|
||||
}
|
||||
if (entry == NULL) {
|
||||
/* New position. */
|
||||
entry = new_position_count_list(board);
|
||||
/* Insert just after the head of the list. */
|
||||
entry->next = position_counts->next;
|
||||
position_counts->next = entry;
|
||||
}
|
||||
else {
|
||||
/* Increment the count. */
|
||||
entry->count++;
|
||||
}
|
||||
return entry->count;
|
||||
}
|
||||
|
||||
/*
|
||||
* Free the list of position counts.
|
||||
*/
|
||||
void
|
||||
free_position_count_list(PositionCount *position_counts)
|
||||
{
|
||||
PositionCount *entry = position_counts;
|
||||
while (entry != NULL) {
|
||||
PositionCount *next = entry->next;
|
||||
(void) free((void *) entry);
|
||||
entry = next;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a new position count list.
|
||||
* This will have a single entry at its head.
|
||||
*/
|
||||
PositionCount *
|
||||
new_position_count_list(const Board *board)
|
||||
{
|
||||
PositionCount *head = (PositionCount *) malloc_or_die(sizeof (*head));
|
||||
head->hash_value = board->weak_hash_value;
|
||||
head->to_move = board->to_move;
|
||||
head->castling_rights = encode_castling_rights(board);
|
||||
if(board->EnPassant) {
|
||||
head->ep_rank = board->ep_rank;
|
||||
head->ep_col = board->ep_col;
|
||||
}
|
||||
else {
|
||||
head->ep_rank = '\0';
|
||||
head->ep_col = '\0';
|
||||
}
|
||||
head->count = 1;
|
||||
head->next = NULL;
|
||||
return head;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return an identical copy of the given list.
|
||||
*/
|
||||
PositionCount *copy_position_count_list(PositionCount *original)
|
||||
{
|
||||
PositionCount *copy = NULL;
|
||||
PositionCount *tail = NULL;
|
||||
while (original != NULL) {
|
||||
PositionCount *entry = (PositionCount *) malloc_or_die(sizeof (*entry));
|
||||
entry->hash_value = original->hash_value;
|
||||
entry->to_move = original->to_move;
|
||||
entry->castling_rights = original->castling_rights;
|
||||
entry->count = original->count;
|
||||
entry->next = NULL;
|
||||
|
||||
if (copy == NULL) {
|
||||
copy = entry;
|
||||
}
|
||||
else {
|
||||
tail->next = entry;
|
||||
}
|
||||
tail = entry;
|
||||
original = original->next;
|
||||
}
|
||||
return copy;
|
||||
}
|
||||
|
||||
/* Determine which table to initialise, depending
|
||||
* on whether use_virtual_hash_table is set or not.
|
||||
*/
|
||||
void
|
||||
init_duplicate_hash_table(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (GlobalState.use_virtual_hash_table) {
|
||||
VirtualLogTable = (LogHeaderEntry *)
|
||||
malloc_or_die(LOG_TABLE_SIZE * sizeof (*VirtualLogTable));
|
||||
for (i = 0; i < LOG_TABLE_SIZE; i++) {
|
||||
VirtualLogTable[i].head = VirtualLogTable[i].tail = -1;
|
||||
}
|
||||
hash_file = fopen(VIRTUAL_FILE, "w+b");
|
||||
if (hash_file == NULL) {
|
||||
fprintf(GlobalState.logfile, "Unable to open %s\n",
|
||||
VIRTUAL_FILE);
|
||||
}
|
||||
}
|
||||
else {
|
||||
LogTable = (HashLog**) malloc_or_die(LOG_TABLE_SIZE * sizeof (*LogTable));
|
||||
for (i = 0; i < LOG_TABLE_SIZE; i++) {
|
||||
LogTable[i] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Close and remove the temporary file if in use. */
|
||||
void
|
||||
clear_duplicate_hash_table(void)
|
||||
{
|
||||
if (GlobalState.use_virtual_hash_table) {
|
||||
if (hash_file != NULL) {
|
||||
(void) fclose(hash_file);
|
||||
unlink(VIRTUAL_FILE);
|
||||
hash_file = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Retrieve a duplicate table entry from the hash file. */
|
||||
static int
|
||||
retrieve_virtual_entry(long ix, VirtualHashLog *entry)
|
||||
{
|
||||
if (hash_file == NULL) {
|
||||
return 0;
|
||||
}
|
||||
else if (fseek(hash_file, ix, SEEK_SET) != 0) {
|
||||
fprintf(GlobalState.logfile,
|
||||
"Fseek error to %ld in retrieve_virtual_entry\n", ix);
|
||||
return 0;
|
||||
}
|
||||
else if (fread((void *) entry, sizeof (*entry), 1, hash_file) != 1) {
|
||||
fprintf(GlobalState.logfile,
|
||||
"Fread error from %ld in retrieve_virtual_entry\n", ix);
|
||||
return 0;
|
||||
}
|
||||
else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Write a duplicate table entry to the hash file. */
|
||||
static int
|
||||
write_virtual_entry(long where, const VirtualHashLog *entry)
|
||||
{
|
||||
if (fseek(hash_file, where, SEEK_SET) != 0) {
|
||||
fprintf(GlobalState.logfile,
|
||||
"Fseek error to %ld in write_virtual_entry\n", where);
|
||||
return 0;
|
||||
}
|
||||
else if (fwrite((void *) entry, sizeof (*entry), 1, hash_file) != 1) {
|
||||
fprintf(GlobalState.logfile,
|
||||
"Fwrite error from %ld in write_virtual_entry\n", where);
|
||||
fflush(hash_file);
|
||||
return 0;
|
||||
}
|
||||
else {
|
||||
/* Written ok. */
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Return the name of the original file if it looks like we
|
||||
* have met the moves in game_details before, otherwise return
|
||||
* NULL. A match is assumed to be so if both
|
||||
* the final_ and cumulative_ hash values in game_details
|
||||
* are already present in VirtualLogTable.
|
||||
*/
|
||||
static const char *
|
||||
previous_virtual_occurance(Game game_details)
|
||||
{
|
||||
unsigned ix = game_details.final_hash_value % LOG_TABLE_SIZE;
|
||||
VirtualHashLog entry;
|
||||
Boolean duplicate = FALSE;
|
||||
const char *original_filename = NULL;
|
||||
|
||||
|
||||
/* Are we keeping this information? */
|
||||
if (GlobalState.suppress_duplicates || GlobalState.suppress_originals ||
|
||||
GlobalState.duplicate_file != NULL) {
|
||||
if (VirtualLogTable[ix].head < 0l) {
|
||||
/* First occurrence. */
|
||||
}
|
||||
else {
|
||||
int keep_going =
|
||||
retrieve_virtual_entry(VirtualLogTable[ix].head, &entry);
|
||||
|
||||
while (keep_going && !duplicate) {
|
||||
if ((entry.final_hash_value == game_details.final_hash_value) &&
|
||||
(entry.cumulative_hash_value == game_details.cumulative_hash_value)) {
|
||||
/* We have a match.
|
||||
* Determine where it first occured.
|
||||
*/
|
||||
original_filename = input_file_name(entry.file_number);
|
||||
duplicate = TRUE;
|
||||
}
|
||||
else if (entry.next >= 0l) {
|
||||
keep_going = retrieve_virtual_entry(entry.next, &entry);
|
||||
}
|
||||
else {
|
||||
keep_going = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!duplicate) {
|
||||
/* Write an entry for it. */
|
||||
/* Where to write the next VirtualHashLog entry. */
|
||||
static long next_free_entry = 0l;
|
||||
|
||||
/* Avoid valgrind error when writing unset bytes that
|
||||
* are part of the structure padding.
|
||||
*/
|
||||
memset((void *) &entry, 0, sizeof(entry));
|
||||
/* Store the XOR of the two hash values. */
|
||||
entry.final_hash_value = game_details.final_hash_value;
|
||||
entry.cumulative_hash_value = game_details.cumulative_hash_value;
|
||||
entry.file_number = current_file_number();
|
||||
entry.next = -1l;
|
||||
|
||||
/* Write out these details. */
|
||||
if (write_virtual_entry(next_free_entry, &entry)) {
|
||||
long where_written = next_free_entry;
|
||||
/* Move on ready for next time. */
|
||||
next_free_entry += sizeof (entry);
|
||||
|
||||
/* Now update the index table. */
|
||||
if (VirtualLogTable[ix].head < 0l) {
|
||||
/* First occurrence. */
|
||||
VirtualLogTable[ix].head =
|
||||
VirtualLogTable[ix].tail = where_written;
|
||||
}
|
||||
else {
|
||||
VirtualHashLog tail;
|
||||
|
||||
if (retrieve_virtual_entry(VirtualLogTable[ix].tail, &tail)) {
|
||||
tail.next = where_written;
|
||||
(void) write_virtual_entry(VirtualLogTable[ix].tail, &tail);
|
||||
/* Store the new tail address. */
|
||||
VirtualLogTable[ix].tail = where_written;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return original_filename;
|
||||
}
|
||||
|
||||
/* Return the name of the original file if it looks like we
|
||||
* have met the moves in game_details before, otherwise return
|
||||
* NULL.
|
||||
* For non-fuzzy comparison, a match is assumed to be so if both
|
||||
* final_ and cumulative_ hash values are already present
|
||||
* as a pair in LogTable.
|
||||
* Fuzzy matches depend on the match depth and do not use the
|
||||
* cumulative hash value.
|
||||
*/
|
||||
const char *
|
||||
previous_occurance(Game game_details, unsigned plycount)
|
||||
{
|
||||
const char *original_filename = NULL;
|
||||
if (GlobalState.use_virtual_hash_table) {
|
||||
original_filename = previous_virtual_occurance(game_details);
|
||||
}
|
||||
else {
|
||||
/* Are we keeping this information? */
|
||||
if (GlobalState.suppress_duplicates ||
|
||||
GlobalState.suppress_originals ||
|
||||
GlobalState.fuzzy_match_duplicates ||
|
||||
GlobalState.duplicate_file != NULL) {
|
||||
Boolean duplicate = FALSE;
|
||||
// Entry index.
|
||||
unsigned ix;
|
||||
HashLog *entry;
|
||||
|
||||
ix = game_details.final_hash_value % LOG_TABLE_SIZE;
|
||||
entry = LogTable[ix];
|
||||
/* Check for non-fuzzy matches first. */
|
||||
while (entry != NULL && !duplicate) {
|
||||
if (entry->final_hash_value == game_details.final_hash_value &&
|
||||
entry->cumulative_hash_value == game_details.cumulative_hash_value) {
|
||||
/* An exact match. */
|
||||
duplicate = TRUE;
|
||||
/* Determine where it first occurred. */
|
||||
original_filename = input_file_name(entry->file_number);
|
||||
}
|
||||
else {
|
||||
entry = entry->next;
|
||||
}
|
||||
}
|
||||
if (!duplicate && GlobalState.fuzzy_match_duplicates) {
|
||||
ix = game_details.fuzzy_duplicate_hash % LOG_TABLE_SIZE;
|
||||
entry = LogTable[ix];
|
||||
while (entry != NULL && !duplicate) {
|
||||
if (GlobalState.fuzzy_match_depth == 0 &&
|
||||
entry->final_hash_value == game_details.final_hash_value) {
|
||||
/* Accept positional match at the end of the game. */
|
||||
duplicate = TRUE;
|
||||
}
|
||||
else {
|
||||
/* Need to check at the fuzzy_match_depth. */
|
||||
if (entry->final_hash_value == game_details.fuzzy_duplicate_hash) {
|
||||
duplicate = TRUE;
|
||||
}
|
||||
}
|
||||
if (duplicate) {
|
||||
/* We have a match.
|
||||
* Determine where it first occurred.
|
||||
*/
|
||||
original_filename = input_file_name(entry->file_number);
|
||||
}
|
||||
else {
|
||||
entry = entry->next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!duplicate) {
|
||||
/* First occurrence, so add it to the log. */
|
||||
entry = (HashLog *) malloc_or_die(sizeof (*entry));
|
||||
|
||||
if (!GlobalState.fuzzy_match_duplicates) {
|
||||
/* Store the two hash values. */
|
||||
entry->final_hash_value = game_details.final_hash_value;
|
||||
entry->cumulative_hash_value = game_details.cumulative_hash_value;
|
||||
}
|
||||
else if (GlobalState.fuzzy_match_depth > 0 &&
|
||||
plycount >= GlobalState.fuzzy_match_depth) {
|
||||
/* Store just the hash value from the fuzzy depth. */
|
||||
entry->final_hash_value = game_details.fuzzy_duplicate_hash;
|
||||
entry->cumulative_hash_value = 0;
|
||||
}
|
||||
else {
|
||||
/* Store the two hash values. */
|
||||
entry->final_hash_value = game_details.final_hash_value;
|
||||
entry->cumulative_hash_value = game_details.cumulative_hash_value;
|
||||
}
|
||||
entry->file_number = current_file_number();
|
||||
/* Link it into the head at this index. */
|
||||
entry->next = LogTable[ix];
|
||||
LogTable[ix] = entry;
|
||||
}
|
||||
/* Without a filename, suppressing duplicates on stdin does not work. */
|
||||
if(duplicate && original_filename == NULL) {
|
||||
original_filename = "_stdin_";
|
||||
}
|
||||
}
|
||||
}
|
||||
return original_filename;
|
||||
}
|
||||
|
||||
/* Define a table to hold the zobrist/polyglot hash codes of starting positions.
|
||||
* Size should be a prime number for collision avoidance.
|
||||
*/
|
||||
#define SETUP_TABLE_SIZE 31957
|
||||
static HashLog *polyglot_codes_of_interest[SETUP_TABLE_SIZE];
|
||||
/* Whether the standard starting position has been seen in the
|
||||
* games processed. This avoids having to generate the zobrist
|
||||
* hash for all games that have no Setup/FEN tags.
|
||||
*/
|
||||
static Boolean standard_start_seen = FALSE;
|
||||
|
||||
/* Check whether the starting position of the given game
|
||||
* has been met before.
|
||||
* Return FALSE if duplicate starting positions are not being
|
||||
* deleted or if the current position has not been met before.
|
||||
* Otherwise return TRUE.
|
||||
*/
|
||||
Boolean check_duplicate_setup(const Game *game_details)
|
||||
{
|
||||
Boolean keep = TRUE;
|
||||
if(GlobalState.delete_same_setup) {
|
||||
if(game_details->tags[FEN_TAG] != NULL) {
|
||||
uint64_t hash = generate_zobrist_hash_from_fen(game_details->tags[FEN_TAG]);
|
||||
unsigned ix = hash % SETUP_TABLE_SIZE;
|
||||
Boolean found = FALSE;
|
||||
for (HashLog *entry = polyglot_codes_of_interest[ix]; !found && (entry != NULL);
|
||||
entry = entry->next) {
|
||||
/* We can test against just the position value. */
|
||||
if (entry->final_hash_value == hash) {
|
||||
found = TRUE;
|
||||
}
|
||||
}
|
||||
if(found) {
|
||||
keep = FALSE;
|
||||
}
|
||||
else {
|
||||
HashLog *entry = (HashLog *) malloc_or_die(sizeof (*entry));
|
||||
/* We don't include the cumulative hash value as this
|
||||
* is the starting position.
|
||||
*/
|
||||
entry->cumulative_hash_value = 0;
|
||||
entry->final_hash_value = hash;
|
||||
/* Link it into the head at this index. */
|
||||
entry->next = polyglot_codes_of_interest[ix];
|
||||
polyglot_codes_of_interest[ix] = entry;
|
||||
}
|
||||
}
|
||||
else if(standard_start_seen) {
|
||||
keep = FALSE;
|
||||
}
|
||||
else {
|
||||
standard_start_seen = TRUE;
|
||||
}
|
||||
}
|
||||
return keep;
|
||||
}
|
||||
|
Reference in New Issue
Block a user