Files
chess-games/pgn-extract/hashing.c
2024-01-22 07:30:05 +01:00

605 lines
19 KiB
C

/*
* This file is part of pgn-extract: a Portable Game Notation (PGN) extractor.
* Copyright (C) 1994-2022 David J. Barnes
*
* pgn-extract is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* pgn-extract is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with pgn-extract. If not, see <http://www.gnu.org/licenses/>.
*
* David J. Barnes may be contacted as d.j.barnes@kent.ac.uk
* https://www.cs.kent.ac.uk/people/staff/djb/
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#if defined(__BORLANDC__) || defined(_MSC_VER)
/* For unlink() */
#include <io.h>
#else
/* For unlink() */
#include <unistd.h>
#endif
#include "bool.h"
#include "mymalloc.h"
#include "defs.h"
#include "typedef.h"
#include "tokens.h"
#include "taglist.h"
#include "lex.h"
#include "hashing.h"
#include "zobrist.h"
/* Routines, similar in nature to those in apply.c
* to implement a duplicate hash-table lookup using
* an external file, rather than malloc'd memory.
* The only limit should be a file system limit.
* NB: Using an external, virtual file seems obsolete now.
*
* This version should be slightly more accurate than
* the alternative because the final_ and cumulative_
* hash values are both stored, rather than the XOR
* of them.
*/
/*
 * The name of the scratch file that backs the virtual (file-based)
 * duplicate table.
 * init_duplicate_hash_table() opens it with mode "w+b", so an existing
 * file of this name is overwritten each time, and
 * clear_duplicate_hash_table() closes and unlinks it.
 */
static char VIRTUAL_FILE[] = "virtual.tmp";
/* Define the size of the hash table.
 * This is the number of buckets in both LogTable and VirtualLogTable;
 * hash values are reduced modulo this size to select a bucket.
 */
#define LOG_TABLE_SIZE 100003
/*
 * Per-bucket header for the file-backed duplicate table.
 * Each bucket is a chain of records held in the scratch file; the header
 * remembers the file offsets of both ends of that chain so that a new
 * record can be appended without walking it.
 */
typedef struct {
    /* File offset of the first record in the chain; -1 => empty bucket. */
    long head;
    /* File offset of the last record in the chain. */
    long tail;
} LogHeaderEntry;

/* Bucket headers, allocated only if use_virtual_hash_table is set. */
static LogHeaderEntry *VirtualLogTable = NULL;
/* Define a table to hold hash values of the extracted games.
 * This is used to enable duplicate detection when not using
 * the virtual hash table.
 * init_duplicate_hash_table() allocates LOG_TABLE_SIZE buckets, each of
 * which is the head of a singly linked chain of HashLog entries.
 */
static HashLog **LogTable = NULL;
/*
 * Fixed-size record written to the scratch file for each distinct game
 * when the virtual duplicate table is in use.
 * Records in the same bucket are chained together through file offsets
 * rather than pointers.
 */
typedef struct VirtualHashLog {
    /* Hash value of the final position of the game. */
    HashCode final_hash_value;
    /* Cumulative hash value for the game. */
    HashCode cumulative_hash_value;
    /* File list index for the file this game was first found in. */
    int file_number;
    /* File offset of the next record in this chain; -1 => end-of-list. */
    long next;
} VirtualHashLog;
/* Stream for the scratch file named by VIRTUAL_FILE.
 * NULL whenever the file is not open: before init_duplicate_hash_table()
 * runs, if its fopen() fails, and after clear_duplicate_hash_table().
 */
static FILE *hash_file = NULL;
/* Forward declaration: defined further down, called by previous_occurance(). */
static const char *previous_virtual_occurance(Game game_details);
/*
 * Decide whether a game passes the repetition filter.
 *
 * When GlobalState.check_for_repetition is not positive no filter is in
 * force and the result is always TRUE. Otherwise the result is TRUE only
 * if at least one entry in position_counts has a count that reaches the
 * requested number of repetitions; an empty (NULL) list therefore fails.
 */
Boolean check_for_only_repetition(PositionCount *position_counts)
{
    const PositionCount *scan;

    if (!(GlobalState.check_for_repetition > 0)) {
        /* Not checking for repetition. */
        return TRUE;
    }
    /* Stop at the first position repeated often enough. */
    for (scan = position_counts; scan != NULL; scan = scan->next) {
        if (!(scan->count < GlobalState.check_for_repetition)) {
            break;
        }
    }
    return scan != NULL;
}
/*
 * Pack the four castling flags of board into the low 4 bits of the
 * result:
 *   0x08 WKingCastle, 0x04 WQueenCastle,
 *   0x02 BKingCastle, 0x01 BQueenCastle.
 * A flag that is unset contributes a zero bit.
 */
static unsigned short
encode_castling_rights(const Board *board)
{
    unsigned bits = 0;

    if (board->BQueenCastle) {
        bits |= 0x01u;
    }
    if (board->BKingCastle) {
        bits |= 0x02u;
    }
    if (board->WQueenCastle) {
        bits |= 0x04u;
    }
    if (board->WKingCastle) {
        bits |= 0x08u;
    }
    return (unsigned short) bits;
}
/*
 * Return TRUE if the position on board is the same as the one recorded
 * in entry, for the purposes of counting repetitions. All of these must
 * agree:
 *  + the board's weak hash value;
 *  + the player to move;
 *  + the encoded castling rights;
 *  + the en passant state: if the board has an en passant square, the
 *    entry must record the same rank and column; if it has none, the
 *    entry must have an ep_rank of '\0'.
 */
static Boolean
position_matches(PositionCount *entry, const Board *board)
{
    if (entry->hash_value != board->weak_hash_value) {
        return FALSE;
    }
    if (entry->to_move != board->to_move) {
        return FALSE;
    }
    if (entry->castling_rights != encode_castling_rights(board)) {
        return FALSE;
    }
    if (!board->EnPassant) {
        /* No ep possible now, so none may have been recorded either. */
        return entry->ep_rank == '\0';
    }
    return board->ep_rank == entry->ep_rank && board->ep_col == entry->ep_col;
}
/*
 * Record the position on board in the position list of the current game.
 * Return the number of times this position has now occurred.
 *
 * A NULL position_counts means positions are not being tracked (this is
 * the case in variations); nothing is recorded and 0 is returned.
 * A position not seen before is added immediately after the head of the
 * list, so the head entry itself is never replaced.
 */
unsigned
update_position_counts(PositionCount *position_counts, const Board *board)
{
    PositionCount *match;

    if (position_counts == NULL) {
        /* Don't try to match in variations. */
        return 0;
    }
    /* An existing entry just has its count bumped. */
    for (match = position_counts; match != NULL; match = match->next) {
        if (position_matches(match, board)) {
            match->count++;
            return match->count;
        }
    }
    /* New position: splice a fresh single-entry list in after the head. */
    match = new_position_count_list(board);
    match->next = position_counts->next;
    position_counts->next = match;
    return match->count;
}
/*
 * Release every entry of a position count list.
 * A NULL list is accepted and nothing is done.
 */
void
free_position_count_list(PositionCount *position_counts)
{
    while (position_counts != NULL) {
        /* Step past the entry before freeing it. */
        PositionCount *doomed = position_counts;
        position_counts = doomed->next;
        free(doomed);
    }
}
/*
 * Create a new position count list holding a single entry that
 * describes the position on board, with a count of 1.
 * The en passant square is recorded only if the board has one;
 * otherwise both ep fields are set to '\0'.
 */
PositionCount *
new_position_count_list(const Board *board)
{
    PositionCount *node = (PositionCount *) malloc_or_die(sizeof (*node));

    node->next = NULL;
    node->count = 1;
    node->hash_value = board->weak_hash_value;
    node->to_move = board->to_move;
    node->castling_rights = encode_castling_rights(board);
    /* Default to "no en passant", then overwrite if one is available. */
    node->ep_rank = '\0';
    node->ep_col = '\0';
    if (board->EnPassant) {
        node->ep_rank = board->ep_rank;
        node->ep_col = board->ep_col;
    }
    return node;
}
/*
 * Return a newly allocated, entry-for-entry copy of the given list.
 *
 * Every field of each entry is duplicated, including the en passant
 * square: position_matches() reads ep_rank and ep_col, so leaving them
 * unset in a copy would make repetition matching on the copy depend on
 * indeterminate values. Only the next links differ from the original;
 * they chain the copies together in the same order.
 *
 * Returns NULL if original is NULL. The result can be released with
 * free_position_count_list().
 */
PositionCount *copy_position_count_list(PositionCount *original)
{
    PositionCount *copy = NULL;
    /* Address of the link in which the next copied entry is to be stored. */
    PositionCount **link = &copy;

    for (const PositionCount *source = original; source != NULL;
            source = source->next) {
        PositionCount *entry = (PositionCount *) malloc_or_die(sizeof (*entry));
        /* Whole-struct assignment so that no field can be missed. */
        *entry = *source;
        entry->next = NULL;
        *link = entry;
        link = &entry->next;
    }
    return copy;
}
/*
 * Allocate and empty the duplicate-detection table selected by
 * GlobalState.use_virtual_hash_table.
 *
 * Virtual table: every bucket header is marked empty (-1) and the
 * scratch file is opened for binary read/write, truncating any previous
 * contents. A failure to open it is reported on the log file and
 * hash_file is left NULL.
 * In-memory table: every bucket of LogTable is set to NULL.
 */
void
init_duplicate_hash_table(void)
{
    int slot;

    if (!GlobalState.use_virtual_hash_table) {
        LogTable = (HashLog **) malloc_or_die(LOG_TABLE_SIZE * sizeof (*LogTable));
        for (slot = 0; slot < LOG_TABLE_SIZE; slot++) {
            LogTable[slot] = NULL;
        }
        return;
    }

    VirtualLogTable = (LogHeaderEntry *)
            malloc_or_die(LOG_TABLE_SIZE * sizeof (*VirtualLogTable));
    for (slot = 0; slot < LOG_TABLE_SIZE; slot++) {
        VirtualLogTable[slot].tail = -1;
        VirtualLogTable[slot].head = -1;
    }
    hash_file = fopen(VIRTUAL_FILE, "w+b");
    if (hash_file == NULL) {
        fprintf(GlobalState.logfile, "Unable to open %s\n", VIRTUAL_FILE);
    }
}
/*
 * Close and remove the scratch file, if the virtual table is in use and
 * the file is currently open. hash_file is reset to NULL afterwards.
 * Nothing is done for the in-memory table.
 */
void
clear_duplicate_hash_table(void)
{
    if (!GlobalState.use_virtual_hash_table) {
        return;
    }
    if (hash_file == NULL) {
        return;
    }
    (void) fclose(hash_file);
    unlink(VIRTUAL_FILE);
    hash_file = NULL;
}
/*
 * Read one duplicate-table record from the scratch file.
 * ix is the byte offset of the record and entry receives its contents.
 * Returns 1 on success. Returns 0 if the file is not open, or if the
 * seek or the read fails; the latter two are reported on the log file.
 */
static int
retrieve_virtual_entry(long ix, VirtualHashLog *entry)
{
    if (hash_file == NULL) {
        return 0;
    }
    if (fseek(hash_file, ix, SEEK_SET) != 0) {
        fprintf(GlobalState.logfile,
                "Fseek error to %ld in retrieve_virtual_entry\n", ix);
        return 0;
    }
    if (fread((void *) entry, sizeof (*entry), 1, hash_file) != 1) {
        fprintf(GlobalState.logfile,
                "Fread error from %ld in retrieve_virtual_entry\n", ix);
        return 0;
    }
    return 1;
}
/*
 * Write one duplicate-table record to the scratch file.
 * where is the byte offset at which to write and entry is the record.
 * Returns 1 on success. Returns 0 if the file is not open, or if the
 * seek or the write fails; the latter two are reported on the log file.
 */
static int
write_virtual_entry(long where, const VirtualHashLog *entry)
{
    /* init_duplicate_hash_table() only logs a failed fopen(), so the
     * stream may be NULL here, and passing a NULL stream to fseek() is
     * undefined. retrieve_virtual_entry() applies the same guard.
     */
    if (hash_file == NULL) {
        return 0;
    }
    else if (fseek(hash_file, where, SEEK_SET) != 0) {
        fprintf(GlobalState.logfile,
                "Fseek error to %ld in write_virtual_entry\n", where);
        return 0;
    }
    else if (fwrite((const void *) entry, sizeof (*entry), 1, hash_file) != 1) {
        fprintf(GlobalState.logfile,
                "Fwrite error from %ld in write_virtual_entry\n", where);
        fflush(hash_file);
        return 0;
    }
    else {
        /* Written ok. */
        return 1;
    }
}
/*
 * File-backed version of the duplicate lookup.
 *
 * Return the name of the original file if it looks like we have met the
 * moves in game_details before, otherwise return NULL. A match is
 * assumed to be so if both the final_ and cumulative_ hash values in
 * game_details are already present, as a pair, in a record of the
 * bucket selected by final_hash_value.
 *
 * If no match is found the game is appended to the scratch file and
 * linked onto the end of its bucket, so later calls can detect it.
 * Nothing is looked up or recorded unless one of suppress_duplicates,
 * suppress_originals or duplicate_file is set.
 *
 * A duplicate for which input_file_name() yields NULL is reported as
 * "_stdin_", as previous_occurance() does for the in-memory table;
 * otherwise the duplicate would be indistinguishable from "not seen".
 */
static const char *
previous_virtual_occurance(Game game_details)
{
    unsigned ix = game_details.final_hash_value % LOG_TABLE_SIZE;
    VirtualHashLog entry;
    Boolean duplicate = FALSE;
    const char *original_filename = NULL;

    /* Are we keeping this information? */
    if (GlobalState.suppress_duplicates || GlobalState.suppress_originals ||
            GlobalState.duplicate_file != NULL) {
        if (VirtualLogTable[ix].head >= 0l) {
            /* Walk the on-disk chain for this bucket. A read failure
             * ends the walk as though the chain had ended.
             */
            int keep_going =
                    retrieve_virtual_entry(VirtualLogTable[ix].head, &entry);
            while (keep_going && !duplicate) {
                if ((entry.final_hash_value == game_details.final_hash_value) &&
                        (entry.cumulative_hash_value == game_details.cumulative_hash_value)) {
                    /* We have a match.
                     * Determine where it first occurred.
                     */
                    original_filename = input_file_name(entry.file_number);
                    duplicate = TRUE;
                }
                else if (entry.next >= 0l) {
                    keep_going = retrieve_virtual_entry(entry.next, &entry);
                }
                else {
                    keep_going = 0;
                }
            }
        }

        if (duplicate) {
            /* Without a filename, suppressing duplicates on stdin does not work. */
            if (original_filename == NULL) {
                original_filename = "_stdin_";
            }
        }
        else {
            /* Write an entry for it. */
            /* Where to write the next VirtualHashLog entry. */
            static long next_free_entry = 0l;

            /* Avoid valgrind error when writing unset bytes that
             * are part of the structure padding.
             */
            memset((void *) &entry, 0, sizeof (entry));
            /* Store both hash values. */
            entry.final_hash_value = game_details.final_hash_value;
            entry.cumulative_hash_value = game_details.cumulative_hash_value;
            entry.file_number = current_file_number();
            entry.next = -1l;
            /* Write out these details. */
            if (write_virtual_entry(next_free_entry, &entry)) {
                long where_written = next_free_entry;

                /* Move on ready for next time. */
                next_free_entry += sizeof (entry);
                /* Now update the index table. */
                if (VirtualLogTable[ix].head < 0l) {
                    /* First occurrence in this bucket. */
                    VirtualLogTable[ix].head =
                            VirtualLogTable[ix].tail = where_written;
                }
                else {
                    VirtualHashLog tail;

                    if (retrieve_virtual_entry(VirtualLogTable[ix].tail, &tail)) {
                        tail.next = where_written;
                        /* Only advance the tail once the link to the new
                         * record is on disk; otherwise the bucket would
                         * end at a record unreachable from its head.
                         */
                        if (write_virtual_entry(VirtualLogTable[ix].tail, &tail)) {
                            VirtualLogTable[ix].tail = where_written;
                        }
                    }
                }
            }
        }
    }
    return original_filename;
}
/* Return the name of the original file if it looks like we
 * have met the moves in game_details before, otherwise return
 * NULL. A game that has not been met before is recorded, so that
 * later calls can detect it.
 *
 * game_details supplies final_hash_value, cumulative_hash_value and
 * fuzzy_duplicate_hash. plycount is consulted only when storing a new
 * entry in fuzzy mode, to decide whether the game is at least
 * fuzzy_match_depth plies long.
 *
 * If use_virtual_hash_table is set the work is handed to
 * previous_virtual_occurance(), which is given neither plycount nor
 * any of the fuzzy settings.
 * For non-fuzzy comparison, a match is assumed to be so if both
 * final_ and cumulative_ hash values are already present
 * as a pair in LogTable.
 * Fuzzy matches depend on the match depth and do not use the
 * cumulative hash value.
 * In the in-memory path, a duplicate for which input_file_name()
 * yields NULL is reported under the name "_stdin_", so the result is
 * never NULL for a duplicate.
 */
const char *
previous_occurance(Game game_details, unsigned plycount)
{
const char *original_filename = NULL;
if (GlobalState.use_virtual_hash_table) {
original_filename = previous_virtual_occurance(game_details);
}
else {
/* Are we keeping this information? */
if (GlobalState.suppress_duplicates ||
GlobalState.suppress_originals ||
GlobalState.fuzzy_match_duplicates ||
GlobalState.duplicate_file != NULL) {
Boolean duplicate = FALSE;
// Entry index.
unsigned ix;
HashLog *entry;
/* The exact search uses the bucket chosen by the final position. */
ix = game_details.final_hash_value % LOG_TABLE_SIZE;
entry = LogTable[ix];
/* Check for non-fuzzy matches first. */
while (entry != NULL && !duplicate) {
if (entry->final_hash_value == game_details.final_hash_value &&
entry->cumulative_hash_value == game_details.cumulative_hash_value) {
/* An exact match. */
duplicate = TRUE;
/* Determine where it first occurred. */
original_filename = input_file_name(entry->file_number);
}
else {
entry = entry->next;
}
}
if (!duplicate && GlobalState.fuzzy_match_duplicates) {
/* The fuzzy search uses the bucket chosen by fuzzy_duplicate_hash.
 * ix keeps this value, so in fuzzy mode a new entry added below is
 * linked into this bucket rather than the final_hash_value one.
 */
ix = game_details.fuzzy_duplicate_hash % LOG_TABLE_SIZE;
entry = LogTable[ix];
while (entry != NULL && !duplicate) {
if (GlobalState.fuzzy_match_depth == 0 &&
entry->final_hash_value == game_details.final_hash_value) {
/* Accept positional match at the end of the game. */
duplicate = TRUE;
}
else {
/* Need to check at the fuzzy_match_depth.
 * This comparison is also reached when the depth is 0 and the
 * final hash values differ.
 */
if (entry->final_hash_value == game_details.fuzzy_duplicate_hash) {
duplicate = TRUE;
}
}
if (duplicate) {
/* We have a match.
* Determine where it first occurred.
*/
original_filename = input_file_name(entry->file_number);
}
else {
entry = entry->next;
}
}
}
if (!duplicate) {
/* First occurrence, so add it to the log. */
entry = (HashLog *) malloc_or_die(sizeof (*entry));
if (!GlobalState.fuzzy_match_duplicates) {
/* Store the two hash values. */
entry->final_hash_value = game_details.final_hash_value;
entry->cumulative_hash_value = game_details.cumulative_hash_value;
}
else if (GlobalState.fuzzy_match_depth > 0 &&
plycount >= GlobalState.fuzzy_match_depth) {
/* Store just the hash value from the fuzzy depth.
 * The final_hash_value field is reused to hold it, which is what
 * the fuzzy search above compares against.
 */
entry->final_hash_value = game_details.fuzzy_duplicate_hash;
entry->cumulative_hash_value = 0;
}
else {
/* Fuzzy mode, but either the depth is 0 or the game is shorter
 * than the fuzzy depth.
 */
/* Store the two hash values. */
entry->final_hash_value = game_details.final_hash_value;
entry->cumulative_hash_value = game_details.cumulative_hash_value;
}
entry->file_number = current_file_number();
/* Link it into the head at this index. */
entry->next = LogTable[ix];
LogTable[ix] = entry;
}
/* Without a filename, suppressing duplicates on stdin does not work. */
if(duplicate && original_filename == NULL) {
original_filename = "_stdin_";
}
}
}
return original_filename;
}
/* Define a table to hold the zobrist/polyglot hash codes of starting positions.
 * Size should be a prime number for collision avoidance.
 * check_duplicate_setup() reduces a position's hash modulo this size
 * to select a bucket.
 */
#define SETUP_TABLE_SIZE 31957
/* Buckets of starting positions already seen; each is the head of a
 * chain of HashLog entries. Having static storage, every bucket starts
 * out as a null pointer.
 */
static HashLog *polyglot_codes_of_interest[SETUP_TABLE_SIZE];
/* Whether the standard starting position has been seen in the
 * games processed. This avoids having to generate the zobrist
 * hash for all games that have no Setup/FEN tags.
 * Set by check_duplicate_setup() the first time it is given a game
 * with no FEN tag while delete_same_setup is in force.
 */
static Boolean standard_start_seen = FALSE;
/* Check whether the starting position of the given game
* has been met before.
* Return FALSE if duplicate starting positions are not being
* deleted or if the current position has not been met before.
* Otherwise return TRUE.
*/
Boolean check_duplicate_setup(const Game *game_details)
{
Boolean keep = TRUE;
if(GlobalState.delete_same_setup) {
if(game_details->tags[FEN_TAG] != NULL) {
uint64_t hash = generate_zobrist_hash_from_fen(game_details->tags[FEN_TAG]);
unsigned ix = hash % SETUP_TABLE_SIZE;
Boolean found = FALSE;
for (HashLog *entry = polyglot_codes_of_interest[ix]; !found && (entry != NULL);
entry = entry->next) {
/* We can test against just the position value. */
if (entry->final_hash_value == hash) {
found = TRUE;
}
}
if(found) {
keep = FALSE;
}
else {
HashLog *entry = (HashLog *) malloc_or_die(sizeof (*entry));
/* We don't include the cumulative hash value as this
* is the starting position.
*/
entry->cumulative_hash_value = 0;
entry->final_hash_value = hash;
/* Link it into the head at this index. */
entry->next = polyglot_codes_of_interest[ix];
polyglot_codes_of_interest[ix] = entry;
}
}
else if(standard_start_seen) {
keep = FALSE;
}
else {
standard_start_seen = TRUE;
}
}
return keep;
}