// File: TextInd.cp
//
// Description: Source file for the text indexing class
//
// Copyright 1991 by Mark Watson.  All rights reserved
//

#include "applib.h"
#include "textind.h"

// Reports a message to the user. Only declared here; the one definition
// visible in this file is the stand-in inside the disabled test code at the
// end of the file.
void Warning(char *);


// Hand-written C-linkage prototypes for the two C library routines used below.
extern "C" { void exit(int); int toupper(int); };
// extern "C" { void sprintf(char *, ...); };

//
// Case-insensitive comparison of the first 'num_to_compare' characters of
// two buffers. Only 'a'..'z' are folded to upper case before comparing.
// Returns 1 when every compared position matches (this includes the case
// num_to_compare <= 0) and 0 at the first mismatch. No check is made for a
// terminating NUL, so both buffers must hold at least num_to_compare chars.
//
static int STR_EQUAL(Char *s1, Char *s2, int num_to_compare)
{
    const int fold = 'A' - 'a';   // offset from a lower case letter to its upper case form
    int pos = 0;
    while (pos < num_to_compare) {
        int left  = s1[pos];
        int right = s2[pos];
        if (left >= 'a' && left <= 'z')    left  += fold;
        if (right >= 'a' && right <= 'z')  right += fold;
        if (left != right)  return 0;
        pos++;
    }
    return 1;
}

//
// Utility to free storage for a NULL terminated list of strings.
//
// Both the individual strings and the list itself must have been allocated
// with array new (this is how regionTextBlocks() builds its result), so they
// are released with array delete. A null list pointer is ignored.
//
void free_strings(char **s)
{
    if (s == (char **)0)  return;
    for (int count = 0; s[count] != (char *)0; count++)
        delete [] s[count];
    delete [] s;
}

//
// Compare two compact character blocks for equality.
// Returns 1 when all nine character fields (n0..n8) of this block match the
// corresponding fields of 'b', otherwise 0.
//
int char_5_block::equal(char_5_block *b) {
    if (n0 != b->n0 || n1 != b->n1 || n2 != b->n2)  return 0;
    if (n3 != b->n3 || n4 != b->n4 || n5 != b->n5)  return 0;
    if (n6 != b->n6 || n7 != b->n7 || n8 != b->n8)  return 0;
    return 1;
}

//
// For a given name, find the first file/block pointer in the current index.
//
// Returns the offset in namePointerSpace of the first file pointer that
// follows the matching compact name, or -1 when the name is not indexed.
//
Long TextIndex::firstHitIndex(char *name)
{
    // Pack the query the same way index entries are packed:
    char_5_block wanted;
    c_8_to_5(name, &wanted);

    // Start of the run of entries that share the query's first character:
    int letter = convert_8_to_5(name[0]);
    Long cursor = firstCharIndex[letter];
    if (cursor < 0) {
        return -1;
    }

    // The run ends where the next populated letter begins, or at the top of
    // the used part of namePointerSpace when no later letter is populated:
    long limit = topNamePointerSpace;
    int next = letter + 1;
    while (next < 26 && firstCharIndex[next] == -1)  next++;
    if (next < 26)  limit = firstCharIndex[next];

    // Walk the entries: each is a compact name followed by one or more
    // file pointers; a pointer whose flag decodes to 0 ends the list.
    int more_follow, file_number, block_number;
    while (cursor < limit) {
        char_5_block *entry = (char_5_block *)&(namePointerSpace[cursor]);
        cursor += sizeof(char_5_block);
        if (wanted.equal(entry))
            return cursor;
        // No match, so step over this entry's file pointers:
        do {
            decodeFilePointer((filePointer *)&(namePointerSpace[cursor]), more_follow,
                              file_number, block_number);
            cursor += sizeof(filePointer);
        } while (cursor < limit && more_follow != 0);
    }
    return -1;
}

//
// Decode a compact file pointer into its three fields.
//
// Note on the flag: addWordToIndex() stores 0 in the only pointer of a newly
// added word and 1 in pointers it inserts ahead of existing ones, and
// regionTextBlocks() keeps reading pointers while the flag is 1. So a
// nonzero flag means "more pointers follow" and 0 marks the last pointer.
//
void TextIndex::decodeFilePointer(filePointer *filePtr,
                                  int &last_pointer_flag,  // 0 for the last entry, nonzero otherwise
                                  int &file_index,         // file index for this entry
                                  int &file_block_index)   // file blocks size=BLOCK_SIZE
{
    last_pointer_flag = filePtr->end_of_list;
    file_index        = filePtr->file_index;
    file_block_index  = filePtr->block_index;
}

//
// Encode a compact file pointer: store the three values into the
// end_of_list, file_index and block_index fields of '*filePtr'.
//
void TextIndex::setFilePointer(filePointer *filePtr,
                               int last_pointer_flag,   // equals 0 for last entry
                               int file_index,          // file index for this entry
                               int file_block_index)    // file blocks size=BLOCK_SIZE
{
    filePointer &slot = *filePtr;
    slot.end_of_list = last_pointer_flag;
    slot.file_index  = file_index;
    slot.block_index = file_block_index;
}

//
// TextIndex constructor for building a new index:
//
// Copies the 'num_files' names in 'fileList' into file_names[], marks the
// first-character table as empty and then indexes every file via buildIndex().
//
TextIndex::TextIndex(char **fileList, int num_files)  // constructor for building index
{
    // One counter declared up front so both loops are valid whichever
    // for-statement scope rule the compiler applies.
    int i;

    // Mark every slot empty. All 28 slots are cleared, not just the 26 letter
    // slots: convert_8_to_5() can yield codes 26 and 27, firstHitIndex() uses
    // that code as an index, and save()/restore() transfer 28 entries.
    for (i=0; i<28; i++) firstCharIndex[i] = -1;

    for (i=0; i<num_files; i++) {
        file_names[i] = new char[strlen(fileList[i]) + 1];
        strcpy(file_names[i], fileList[i]);
    }
    number_of_files = num_files;
    topNamePointerSpace = 0;
    buildIndex();
}

//
// TextIndex constructor for restoring from an existing index file.
// All the work is done by restore(); its return value is not examined.
//
TextIndex::TextIndex(char *fileName)   // constructor for reloading an old index
{
    (void) restore(fileName);
}

//
// TextIndex destructor:
//
// Releases the file name strings that both constructors allocate with
// array new. The slots are cleared afterwards so a stale pointer is never
// left behind.
// NOTE(review): this assumes TextIndex objects are never copied by value;
// a copy would share the same file_names pointers -- confirm against the
// class declaration.
//
TextIndex::~TextIndex()
{
    for (int i=0; i<number_of_files; i++) {
        delete [] file_names[i];
        file_names[i] = (char *)0;
    }
    number_of_files = 0;
}

//
// Local helper function to extract individual words from a string.
//
//   word_buf  - NUL terminated input; any character below 'A' is a separator.
//   words     - eight caller-supplied buffers of 256 characters each; all of
//               them are cleared before parsing.
//   num_words - receives the number of slots used (always at least 1; a
//               slot may hold an empty string when separators are adjacent).
//
// At most eight words are kept. Once the eighth slot holds a word, the rest
// of the input is ignored rather than written over that word. Each word is
// limited to 255 characters so its buffer always stays NUL terminated.
//
static void parse_words(char *word_buf, char *words[], int &num_words)
{
    const int MAX_WORDS    = 8;
    const int MAX_WORD_LEN = 255;

    for (int k=0; k<MAX_WORDS; k++)
        for (int l=0; l<=MAX_WORD_LEN; l++)  words[k][l] = '\0';

    int count = 0;
    int charCount = 0;
    int len = strlen(word_buf);
    for (int i=0; i<len; i++) {
        if (word_buf[i] < 'A') {
            if (count < MAX_WORDS - 1) {
                count++;
                charCount = 0;
            }
            else if (charCount > 0)
                break;  // the last slot already holds a word: drop the remainder
        }
        else if (charCount < MAX_WORD_LEN)
            words[count][charCount++] = word_buf[i];
    }
    num_words = count + 1;
}

//
// Add a new word to an existing index:
//
void TextIndex::addWordToIndex(char *word, int file_count, int block_count)
{
    if (file_count > 7) {
        Warning("Warning from TextIndex::addWordToIndex: file_count is stored as 3 bits. Skip word.");
        return;
    }

    // We start by finding where we need to insert the new index. Here are the
    // cases we need to handle:
    //
    //  1)      No indices for any word starting with the first character of 'word'.
    //          We need to search backwards in the small table 'firstCharIndex' in
    //          order to find the insertion point.
    //  2)      Indices exist for any word starting with the first character of 'word'
    //          but none for 'word' itself.
    //  3)      An index exists for the word 'word', add a pointer with the current
    //          file and block indices.
    //  4)      An index exists for the word 'word' with matching file and block indices;
    //          we don't have to do anything in this case.
    //
    // Note: in cases 1, 2, and 3: we need to adjust all pointers in the small table
    //       'firstCharIndex' after the entry for the first letter of 'word'.
    
    char_5_block compact_chars;
    c_8_to_5(word, &compact_chars);
    int compact_first_char = convert_8_to_5(word[0]);
    if (firstCharIndex[compact_first_char] == -1) { // case # 1
        // search forewards in table firstCharIndex:
        Long lastIndex;
        for (int bc=compact_first_char + 1; bc < 26; bc++)
            if (firstCharIndex[bc] != -1) {
                lastIndex = firstCharIndex[bc];
                // We have found the insertion point. First update firstCharIndex[]:
                for (int g=bc; g<26; g++)  // Note: we start at bc, not (bc + 1) here!
                    if (firstCharIndex[g] != -1)
                        firstCharIndex[g] += sizeof(char_5_block) + sizeof(filePointer);
                firstCharIndex[compact_first_char] = lastIndex;
                // Now shift the data in namePointerSpace:
                for (Long t=topNamePointerSpace-1; t>=lastIndex; t--)
                        namePointerSpace[(long)(t+sizeof(char_5_block) + sizeof(filePointer))] =
                            namePointerSpace[t];
                topNamePointerSpace += sizeof(char_5_block) + sizeof(filePointer);
                c_8_to_5(word,(char_5_block *)&(namePointerSpace[lastIndex]));
                setFilePointer((filePointer *)&(namePointerSpace[(long)(lastIndex +
                                                sizeof(char_5_block))]),
                                0,file_count, block_count);
                return;
            }
        // We did not find a first char index in the table with an index value > compact_first_char
        // so we must have the case where we are adding something to the end of namePointerSPace:
        lastIndex = topNamePointerSpace;
        if (firstCharIndex[compact_first_char] < 0)
            firstCharIndex[compact_first_char] = lastIndex;
        topNamePointerSpace += sizeof(char_5_block) + sizeof(filePointer);
        c_8_to_5(word,(char_5_block *)&(namePointerSpace[lastIndex]));
        setFilePointer((filePointer *)&(namePointerSpace[lastIndex +
                                        sizeof(char_5_block)]),
                        0,file_count, block_count);
        return;
    } 
    if (firstCharIndex[compact_first_char] != -1) { // case  3 (MUST be before case 2 code!!
        Long lastIndex = firstCharIndex[compact_first_char];
        // Search the table until we find a new index entry with the first compact
        // character code > bc, OR we reach the end of the namePointerSpace:
        do {
            if (compact_chars.equal((char_5_block *)&(namePointerSpace[lastIndex]))) {
                // See if the current file/block combination is already in the
                // table for this keyword:
                Long fileIndex = lastIndex + sizeof(char_5_block);
                int last_p = 1; int file_num, block_num;
                while (last_p != 0) {
                    decodeFilePointer((filePointer *)&(namePointerSpace[fileIndex]),
                                      last_p, file_num, block_num);
                    if (file_num == file_count && block_num == block_count) {
                        return;
                    }
                    fileIndex += sizeof(filePointer);
                }
                if (firstCharIndex[compact_first_char] < 0)
                    firstCharIndex[compact_first_char] = lastIndex;
                // We have found the insertion point. First update firstCharIndex[]:
                for (int g=compact_first_char+1; g<26; g++)
                    if (firstCharIndex[g] != -1)
                        firstCharIndex[g] += sizeof(filePointer);
                // Now shift the data in namePointerSpace:
                Long fpSize = sizeof(filePointer);
                for (Long t=topNamePointerSpace-1;
                     t>=lastIndex + sizeof(char_5_block);
                     t--)
                    namePointerSpace[t + fpSize] = namePointerSpace[t];
                topNamePointerSpace += sizeof(filePointer);
                setFilePointer((filePointer *)&(namePointerSpace[lastIndex+sizeof(char_5_block)]),1, // 1 not 0!!
                               file_count, block_count);
                return;
            }               
            // skip past the compact name field:
            lastIndex += sizeof(char_5_block);
            // skip past file pointers:
            while (lastIndex < topNamePointerSpace &&
                   (namePointerSpace[lastIndex] & 256) != 0)   lastIndex += sizeof(filePointer);
            lastIndex += sizeof(filePointer);
        } while (lastIndex < topNamePointerSpace);
    } 
    if (firstCharIndex[compact_first_char] != -1) { // case # 2
        Long lastIndex = firstCharIndex[compact_first_char];
        // We have found the insertion point. First update firstCharIndex[]:
        for (int g=compact_first_char+1; g<26; g++)
            if (firstCharIndex[g] != -1)
                firstCharIndex[g] += sizeof(char_5_block) + sizeof(filePointer);
        // Now shift the data in namePointerSpace:
        for (Long t=topNamePointerSpace-1;
             t>=lastIndex;
             t--)
            namePointerSpace[t+sizeof(char_5_block) + sizeof(filePointer)] = namePointerSpace[t];
        topNamePointerSpace += sizeof(char_5_block) + sizeof(filePointer);
        c_8_to_5(word,(char_5_block *)&(namePointerSpace[lastIndex]));
        setFilePointer((filePointer *)&(namePointerSpace[lastIndex+sizeof(char_5_block)]),0,
                       file_count, block_count);
        return;
    }
}

//
// Index one file: read it token by token and add every word of three or
// more letters to the in-memory index.
//
//   file_name   - path of the text file to read; the program exits with
//                 status 1 if it cannot be opened.
//   file_number - index of this file in file_names[], stored with each word.
//
// Each whitespace-delimited token is upper-cased, every character outside
// 'A'..'Z' is turned into a space, and the result is split by parse_words().
// The block number stored for a token is derived from the stream position
// recorded after the previous token was read.
//
void TextIndex::buildIndex_helper(char *file_name, int file_number)
{
    int i;  // declared once; used by the allocation and release loops
    char *words[8]; // up to 8 256 character words
    for (i=0; i<8; i++) words[i] = new char[256];
    filebuf in_file;
    if (in_file.open(file_name, input)==0) {
        char buf5[256];
        // The precision keeps a long file name from overflowing buf5:
        sprintf(buf5,"Could not open input file %.200s",file_name);
        Warning(buf5);
        exit(1);
    }
    long charCount = 0;
    istream in_stream(&in_file);
    char word_buf[256];
    for (;;) {
        // The field width is reset by every extraction, so set it each time;
        // it stops a long token from overrunning word_buf.
        in_stream.width((int)sizeof(word_buf));
        in_stream >> word_buf;
        // Stop only when nothing could be extracted. Testing eof() instead
        // would drop a final token that is not followed by whitespace.
        if (in_stream.fail())  break;

        int len = strlen(word_buf);
        int blockCount = (int)(charCount / BLOCK_SIZE);
        charCount = in_stream.tellg();
        for (int j=0; j<len; j++) {
            // toupper() must not be given a negative char value:
            word_buf[j] = toupper((unsigned char)word_buf[j]);
            if (word_buf[j] < 'A' || word_buf[j] > 'Z')  word_buf[j] = ' ';
        }
        int num_words;
        parse_words(word_buf, words, num_words);
        for (int m=0; m<num_words; m++)
            if (strlen(words[m]) > 2)   // skip 1 and 2 letter words
                addWordToIndex(words[m], file_number, blockCount);
    }
    in_file.close();
    // The buffers came from array new, so release them with array delete:
    for (i=0; i<8; i++) delete [] words[i];
}

// Build a new index in memory from the current file list: every entry of
// file_names[] is handed to buildIndex_helper() together with its position
// in the list. Nothing is returned.
void TextIndex::buildIndex()
{
    int file_slot = 0;
    while (file_slot < number_of_files) {
        buildIndex_helper(file_names[file_slot], file_slot);
        file_slot++;
    }
}

//
// Save the current index to a disk file.
//
// File layout, in order: number_of_files (sizeof(int) bytes),
// topNamePointerSpace (sizeof(long) bytes), 28 entries of firstCharIndex,
// the used part of namePointerSpace, then one file name per line.
// restore() reads the same layout back.
//
// Exits the program with status 1 if the file cannot be opened; otherwise
// always returns 0.
//
int TextIndex::save(char *indexFileName)
{
    filebuf out_file;
    if (out_file.open(indexFileName, output)==0) {
        char buf5[256];
        // The precision keeps a long file name from overflowing buf5:
        sprintf(buf5,"Could not open output file %.200s",indexFileName);
        Warning(buf5);
        exit(1);
    }

    ostream out_stream(&out_file);
    out_stream.write((unsigned char *)&number_of_files, sizeof(int));
    out_stream.write((unsigned char *)&topNamePointerSpace, sizeof(long));
    out_stream.write((unsigned char *)firstCharIndex, 28 * sizeof(long));
    out_stream.write((unsigned char *)&(namePointerSpace[0]), topNamePointerSpace);  // fix this!!
    for (int i=0; i<number_of_files; i++)
        out_stream << file_names[i] << "\n";
    out_file.close();

    return 0;
}

//
// Restore an existing index from disk.
//
// Reads the layout written by save(): number_of_files, topNamePointerSpace,
// 28 entries of firstCharIndex, the used part of namePointerSpace, and then
// one whitespace-delimited file name per indexed file.
//
// Exits the program with status 1 if the file cannot be opened; otherwise
// always returns 0.
//
int TextIndex::restore(char *indexFileName)
{
    filebuf in_file;
    if (in_file.open(indexFileName, input)==0) {
        char buf5[256];
        // The precision keeps a long file name from overflowing buf5:
        sprintf(buf5,"Could not open input file %.200s",indexFileName);
        Warning(buf5);
        exit(1);
    }

    istream in_stream(&in_file);
    in_stream.read((unsigned char *)&number_of_files, sizeof(int));
    in_stream.read((unsigned char *)&topNamePointerSpace, sizeof(long));
    in_stream.read((unsigned char *)firstCharIndex, 28 * sizeof(long));
    in_stream.read((unsigned char *)&(namePointerSpace[0]), topNamePointerSpace);

    // Read each name into a bounded scratch buffer and then allocate exactly
    // what it needs, as the building constructor does. Extracting straight
    // into a fixed 64 byte allocation would overrun it for longer names.
    char name_buf[256];
    for (int i=0; i<number_of_files; i++) {
        name_buf[0] = '\0';
        in_stream.width((int)sizeof(name_buf));  // width is reset by every extraction
        in_stream >> name_buf;
        file_names[i] = new char[strlen(name_buf) + 1];
        strcpy(file_names[i], name_buf);
    }
    in_file.close();

    return 0;
}

//
//  Collect text regions containing a specified word:
//

// Bookkeeping for the most recent regionTextBlocks() call, read back by
// regionIndex(): the file and block number recorded for each returned text
// region, and the number of regions returned. These are file-scope statics,
// not members, so they are shared by every TextIndex object.
static int last_block[64], last_file[64];
static int numReturnedRegions = 0;

// Looks 'input_name' up in the index and returns a list of short text
// excerpts (up to 90 characters each) surrounding every occurrence found in
// the indexed file blocks.
//
// Returns 0 if the name is shorter than two characters or is not indexed
// (a warning is issued in the latter case). Otherwise returns a list,
// allocated with array new, of at most 63 strings followed by a null
// pointer; the caller releases it with free_strings(). The file and block of
// each excerpt are recorded for regionIndex().
//
// Only the first eight characters of the name are compared against the file
// text. The program exits with status 1 if an indexed file cannot be opened.
Char **TextIndex::regionTextBlocks(char *input_name) // returns 0 if name not indexed
{
    int len = strlen(input_name);  if (len > 8)  len = 8;
    if (len < 2) {
        return (Char **)0;
    }

    // Upper-cased private copy of the name; the precision keeps an
    // over-long argument from overflowing the buffer.
    char name[64];
    sprintf(name,"%.63s",input_name);
    for (int m=0; name[m] != '\0'; m++)
        if (name[m] >= 'a' && name[m] <= 'z')  name[m] += 'A' - 'a';

    Long firstIndex = firstHitIndex(name);
    if (firstIndex < 0) {
        char buf8[256];
        sprintf(buf8,"No occurrences of %.200s",input_name);
        Warning(buf8);
        return (Char **)0;
    }

    // 64 slots: up to 63 text regions plus the terminating null pointer.
    char **returnList = new char * [64];
    numReturnedRegions = 0;

    // Loop over all fileblock pointers for this index table entry for 'name':
    int last_element, file_number, block_number;
    do {
        decodeFilePointer((filePointer *)&(namePointerSpace[firstIndex]), last_element,
                          file_number, block_number);
        // Collect text from this file/block pointer to return to calling function:

        filebuf in_file;
        if (in_file.open(file_names[file_number], input)==0) {
//          cerr << "Could not open input file " << file_names[f] << "\n";
            exit(1);
        }

        istream in_stream(&in_file);
        char word_buf[BLOCK_SIZE+28];
        // Start a little before the block so a word that straddles the block
        // boundary is still seen; computed as long to avoid int overflow.
        long init_file_pos = (long)block_number * BLOCK_SIZE - 12;
        if (init_file_pos < 0)  init_file_pos = 0;
        in_stream.seekg(init_file_pos);
        in_stream.read(word_buf,BLOCK_SIZE+26);
        int num_chars_read = in_stream.gcount();

        // Stop collecting once 63 regions are held: a further match would
        // have no slot of its own and its storage would be lost.
        for (int n=0; n<(num_chars_read - len) && numReturnedRegions < 63; n++) {
            if (STR_EQUAL(name, &(word_buf[n]), len) == 1) {
                // Copy the characters surrounding the match, replacing
                // control characters with spaces:
                int first = n - 32;    if (first < 0)  first = 0;
                int last = first + 90; if (last > num_chars_read) last = num_chars_read;
                returnList[numReturnedRegions] = new char[2 + last - first];
                for (int l=first; l<last; l++)
                    if (word_buf[l] > 20) returnList[numReturnedRegions][l-first] = word_buf[l];
                      else                returnList[numReturnedRegions][l-first] = ' ';
                returnList[numReturnedRegions][last-first] = '\0';
                last_file[numReturnedRegions] = file_number;
                last_block[numReturnedRegions] = block_number;
                numReturnedRegions++;
            }
        }

        in_file.close();

        // Bump the file pointer index:
        firstIndex += sizeof(filePointer);
    } while (last_element == 1 && numReturnedRegions < 63);
    returnList[numReturnedRegions] = (char *)0;
    return returnList;
}


// Reports the file and block recorded for one of the text regions returned
// by the most recent regionTextBlocks() call.
//
//   occurence - zero-based position of the region in the returned list.
//   file      - receives the file index of that region.
//   block     - receives the block index of that region.
//
// Returns 1 on success. Returns 0, after issuing a warning and leaving
// 'file' and 'block' untouched, when no regions are recorded or 'occurence'
// is outside 0 .. numReturnedRegions-1.
int TextIndex::regionIndex(int occurence, int &file, int &block) // returns 0 if name not indexed
{
    if (numReturnedRegions < 1 || occurence < 0 || occurence > (numReturnedRegions-1)) {
        Warning("No occurences of selected word");
        return 0;
    }

    file = last_file[occurence];
    block = last_block[occurence];
    return 1;
}

// Reads text from an indexed file into a caller-supplied buffer.
//
//   buffer      - destination; always NUL terminated on return.
//   buffer_size - capacity of 'buffer' in bytes; at most buffer_size-1
//                 characters are read.
//   file        - index into file_names[] of the file to read.
//   block       - block number; reading starts 12 bytes before the start of
//                 the block (or at the start of the file, if that would be
//                 negative).
//
// Does nothing when 'buffer' is null or 'buffer_size' is less than 1. The
// program exits with status 1 if the file cannot be opened.
void TextIndex::get_text(char *buffer, int buffer_size, int file, int block)
{
    if (buffer == (char *)0 || buffer_size < 1)  return;
    buffer[0] = '\0';

    filebuf in_file;
    if (in_file.open(file_names[file], input)==0) {
//          cerr << "Could not open input file " << file_names[f] << "\n";
        exit(1);
    }
    istream in_stream(&in_file);
    // Computed as long so a large block number cannot overflow int:
    long init_file_pos = (long)block * BLOCK_SIZE - 12;
    if (init_file_pos < 0)  init_file_pos = 0;
    in_stream.seekg(init_file_pos);
    in_stream.read(buffer,buffer_size-1);
    int num_chars_read = in_stream.gcount();
    buffer[num_chars_read] = '\0';
    in_file.close();
//cerr << "\n\nretrieved text:\n" << buffer << "\n";
}


// Utilities for converting between 5 bit compact character codes and
// 8 bit character codes:

// Expand the nine compact codes of '*compactChars' into 8 bit characters.
// 'bufferFor8BitChars' must have room for ten characters: nine converted
// codes plus the terminating NUL written at index 9.
void TextIndex::c_5_to_8(char_5_block *compactChars, Char *bufferFor8BitChars)
{
    Char *out = bufferFor8BitChars;
    *out++ = convert_5_to_8(compactChars->n0);
    *out++ = convert_5_to_8(compactChars->n1);
    *out++ = convert_5_to_8(compactChars->n2);
    *out++ = convert_5_to_8(compactChars->n3);
    *out++ = convert_5_to_8(compactChars->n4);
    *out++ = convert_5_to_8(compactChars->n5);
    *out++ = convert_5_to_8(compactChars->n6);
    *out++ = convert_5_to_8(compactChars->n7);
    *out++ = convert_5_to_8(compactChars->n8);
    *out = '\0';
}
// Pack the first nine characters of a NUL terminated string into the nine
// compact code fields of '*bufferForCompactChars'. Strings shorter than nine
// characters are padded with the code for a space; characters beyond the
// ninth are ignored. A null string pointer is treated as an empty string.
//
// The characters are read straight from the argument instead of being
// formatted into a fixed-size scratch buffer first, so a string of any
// length is safe.
void TextIndex::c_8_to_5(Char *originalEightBitChars, char_5_block *bufferForCompactChars)
{
    int codes[9];
    int pos = 0;
    if (originalEightBitChars != 0)
        for (; pos < 9 && originalEightBitChars[pos] != '\0'; pos++)
            codes[pos] = convert_8_to_5(originalEightBitChars[pos]);
    for (; pos < 9; pos++)
        codes[pos] = convert_8_to_5(' ');   // pad short names with spaces

    bufferForCompactChars->n0 = codes[0];
    bufferForCompactChars->n1 = codes[1];
    bufferForCompactChars->n2 = codes[2];
    bufferForCompactChars->n3 = codes[3];
    bufferForCompactChars->n4 = codes[4];
    bufferForCompactChars->n5 = codes[5];
    bufferForCompactChars->n6 = codes[6];
    bufferForCompactChars->n7 = codes[7];
    bufferForCompactChars->n8 = codes[8];
}

// Map a compact code to an 8 bit character: 26 becomes a newline, 27 a
// space, and any other value v becomes 'A' + v (so 0..25 give 'A'..'Z').
int TextIndex::convert_5_to_8(int bit_5)
{
    switch (bit_5) {
    case 26:
        return '\n';
    case 27: // space character
        return ' ';
    default:
        return bit_5 + 'A';
    }
}

// Map an 8 bit character to a compact code: letters of either case give
// 0..25, a newline gives 26, and everything else (including a space)
// gives 27.
int TextIndex::convert_8_to_5(int bit_8)
{
    // Fold lower case onto upper case first:
    if (bit_8 >= 'a' && bit_8 <= 'z')  bit_8 -= 'a' - 'A';

    if (bit_8 >= 'A' && bit_8 <= 'Z')  return bit_8 - 'A';
    if (bit_8 == '\n')                 return 26;
    return 27;
}
        
// Debug dump of the index to cerr: the firstCharIndex table, then every
// entry of namePointerSpace with its address, name and block numbers.
// The whole body is currently compiled out with "#if 0", so calling this
// function does nothing.
void TextIndex::print_out()
{
#if 0
    cerr << "\n\nDebug printout of the index table: topNamePointerSpace="
         << topNamePointerSpace << "\n\nfirstCharIndex table:\n\n";
    for (int i=0; i<26; i++)
        cerr << "     " << i << " : " << firstCharIndex[i] << "\n";
    cerr << "\nTable (address, name, filepointers.....)\n\n";

    Long lastIndex = 0;
    // Walk every entry from the start of namePointerSpace up to the top of
    // the used space:
    int last_element, file_number, block_number;
    char name[10]; 
    while (lastIndex < (topNamePointerSpace-1)) {
        for (int m=0; m<10; m++) name[m] = '\0';
        c_5_to_8((char_5_block *)&(namePointerSpace[lastIndex]), (Char *)name);
        cerr << "\n  Address: " << lastIndex << ", name: " << name << " ";
        // skip past the compact name field:
        lastIndex += sizeof(char_5_block);
        // print and skip past file pointers:
        do {
            decodeFilePointer((filePointer *)&(namePointerSpace[lastIndex]), last_element,
                              file_number, block_number);
            cerr << " block #: " << block_number;
            lastIndex += sizeof(filePointer);
        } while (lastIndex < (topNamePointerSpace-1) &&
                 last_element != 0);
    }
    cerr << "\n";
#endif
}

// Test code:
//
// Stand-alone test driver, disabled with "#if 0". When enabled it either
// builds an index from testFileList or reloads "test.index", prints the
// index, looks up the word "design", prints and frees the matching text
// regions, and (when building) saves the index to "test.index".

#if 0

// define this flag to test restore-from-file option:
//#define RESTORE 1

// Files indexed when the index is built from scratch:
char *testFileList[] = {"sample.txt"};

// Stand-in for the application's Warning(): write the message to cerr.
void Warning(char *message)
{
    cerr << message << "\n";
}

void main()
{

    // Build a fresh index, or reload a saved one when RESTORE is defined:
#ifndef RESTORE 
    TextIndex ti(testFileList,1);
#endif

#ifdef RESTORE
    TextIndex ti("test.index");
#endif

    ti.print_out();
    
    cerr << "Calling regionTextBlocks: design\n";
    char **s = ti.regionTextBlocks("design");
    int count = 0;
    if (s != 0) {
      // The returned list is terminated by a null pointer:
      while (s[count] != (char *)0) {
          cerr << "match: " << s[count] << "\n";
          count++;
      }
      cerr << "Freeing string storage:\n";
      free_strings(s);
    }
    
    // Only a freshly built index is written back to disk:
#ifndef RESTORE 
    ti.save("test.index");
#endif
}

#endif
