#include "LCSegmentMerger.h"
#include "LCSegmentMergeInfo.h"
#include "LCSegmentMergeQueue.h"
#include "LCFieldInfos.h"
#include "LCFieldInfo.h"
#include "LCFieldsWriter.h"
#include "LCTermInfosWriter.h"
#include "LCTermInfo.h"
#include "LCTerm.h"
#include "LCTermVectorsWriter.h"
#include "LCIndexReader.h"
#include "LCIndexWriter.h"
#include "LCCompoundFileWriter.h"
#include "LCField.h"
#include "LCIndexOutput.h"
#include "LCRAMOutputStream.h"
#include "GNUstep.h"
/**
* The SegmentMerger class combines two or more Segments, each represented by an
* LCIndexReader (see -addIndexReader:), into a single Segment. After adding the
* appropriate readers, call -merge to combine the segments.
*
* If the compoundFile flag is set, then the segments will be merged into a compound file
* (see -createCompoundFile:).
*
*
* @see -merge
* @see -addIndexReader:
*/
@interface LCSegmentMerger (LCPrivate)
/* Internal merge steps. They are declared here so that methods in the
 * implementation can message them before their definitions appear. */
- (int) mergeFields;
- (void) mergeVectors;
- (void) mergeTerms;
- (void) mergeTermInfos;
- (void) mergeTermInfo: (NSArray *) smis size: (int) n;
- (int) appendPosting: (NSArray *) smis size: (int) n;

/* Skip-data bookkeeping used while postings are appended. */
- (void) resetSkip;
- (void) bufferSkip: (int) doc;
- (long) writeSkip;

- (void) mergeNorms;

/* The selector must read "isStorePositionWithTermVector:" (singular) so that
 * it matches both the method definition and every call site in this file. */
- (void) addIndexed: (LCIndexReader *) r
         fieldInfos: (LCFieldInfos *) fi
              names: (NSArray *) n
 isTermVectorStored: (BOOL) tv
isStorePositionWithTermVector: (BOOL) pos
isStoreOffsetWithTermVector: (BOOL) off;
@end
@implementation LCSegmentMerger
/**
 * Sets up an empty merger: default term index interval, an empty reader
 * list, the tables of per-segment file extensions and the in-memory skip buffer.
 * The target directory and segment name are left unset.
 */
- (id) init
{
    self = [super init];
    if (self == nil)
    {
        // Instance variables must not be touched through a nil self.
        return nil;
    }

    termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL;
    readers = [[NSMutableArray alloc] init];

    // File extensions of old-style index files
    COMPOUND_EXTENSIONS = [[NSArray alloc] initWithObjects:
        @"fnm", @"frq", @"prx", @"fdx", @"fdt", @"tii", @"tis", nil];
    VECTOR_EXTENSIONS = [[NSArray alloc] initWithObjects:
        @"tvx", @"tvd", @"tvf", nil];

    skipBuffer = [[LCRAMOutputStream alloc] init];
    return self;
}
/* Gives up ownership of every object held in an instance variable, then
 * chains to the superclass. DESTROY comes from GNUstep.h (release the
 * variable and reset it to nil). */
- (void) dealloc
{
// Merge target and the list of source readers.
DESTROY(directory);
DESTROY(segment);
DESTROY(readers);
DESTROY(fieldInfos);
// Extension tables created in -init.
DESTROY(COMPOUND_EXTENSIONS);
DESTROY(VECTOR_EXTENSIONS);
// Objects assigned in -mergeTerms.
DESTROY(freqOutput);
DESTROY(proxOutput);
DESTROY(termInfosWriter);
DESTROY(queue);
// Skip buffer created in -init.
DESTROY(skipBuffer);
[super dealloc];
}
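/** Initializes a merger that writes the merged segment into the given directory.
* Used directly by test code and as the base initializer of -initWithIndexWriter:name:.
*
* @param dir The Directory to merge the other segments into
* @param name The name of the new segment
*/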
- (id) initWithDirectory: (id ) dir name: (NSString *) name
{
    self = [self init];
    if (self != nil)
    {
        // ASSIGN writes to instance variables, so it must only run on a
        // successfully initialized object.
        ASSIGN(directory, dir);
        ASSIGN(segment, name);
    }
    return self;
}
/**
 * Initializes a merger that writes into the directory of the given writer
 * and adopts that writer's term index interval.
 *
 * @param writer The index writer supplying the directory and the interval
 * @param name The name of the new segment
 */
- (id) initWithIndexWriter: (LCIndexWriter *) writer name: (NSString *) name
{
    self = [self initWithDirectory: [writer directory] name: name];
    if (self != nil)
    {
        // Replace the default interval chosen in -init with the writer's one.
        termIndexInterval = [writer termIndexInterval];
    }
    return self;
}
/**
* Add an IndexReader to the collection of readers that are to be merged
* @param reader The reader whose segment should take part in the merge
*/
- (void) addIndexReader: (LCIndexReader *) reader
{
    // Readers are merged in the order in which they were registered.
    NSMutableArray *registered = readers;
    [registered addObject: reader];
}
/**
*
* @param i The index of the reader to return
* @return The ith reader to be merged
*/
- (LCIndexReader *) segmentReader: (int) i
{
    // Plain positional lookup in the list filled by -addIndexReader:.
    LCIndexReader *found = (LCIndexReader *) [readers objectAtIndex: i];
    return found;
}
/**
* Merges the readers specified by the {@link #add} method into the directory passed to the constructor
* @return The number of documents that were merged
* @throws IOException
*/
- (int) merge
{
    // Fields come first: the later steps read the fieldInfos built here.
    int mergedDocuments = [self mergeFields];

    [self mergeTerms];
    [self mergeNorms];

    // Term vectors are written only when some field stores them.
    if (![fieldInfos hasVectors])
    {
        return mergedDocuments;
    }
    [self mergeVectors];
    return mergedDocuments;
}
/**
* close all IndexReaders that have been added.
* Should not be called before merge().
* @throws IOException
*/
- (void) closeReaders
{
    // Walk the registered readers in order and close each one.
    NSEnumerator *pending = [readers objectEnumerator];
    LCIndexReader *current;
    while ((current = (LCIndexReader *) [pending nextObject]))
    {
        [current close];
    }
}
/**
 * Packs the files of the merged segment into one compound file.
 *
 * @param fileName Name of the compound file to create in the directory
 * @return The names of the segment files that were added to the compound file
 */
- (NSArray *) createCompoundFile: (NSString *) fileName
{
    LCCompoundFileWriter *writer =
        [[LCCompoundFileWriter alloc] initWithDirectory: directory name: fileName];
    NSMutableArray *members = [[NSMutableArray alloc] init];
    NSEnumerator *extensions;
    NSString *extension;
    int number;
    int m;

    // Basic files
    extensions = [COMPOUND_EXTENSIONS objectEnumerator];
    while ((extension = [extensions nextObject]))
    {
        [members addObject: [segment stringByAppendingPathExtension: extension]];
    }

    // Field norm files: one per indexed field that keeps norms
    for (number = 0; number < [fieldInfos size]; number++)
    {
        LCFieldInfo *info = [fieldInfos fieldInfoWithNumber: number];
        if (![info isIndexed] || [info omitNorms])
        {
            continue;
        }
        NSString *normExtension = [NSString stringWithFormat: @"f%d", number];
        [members addObject: [segment stringByAppendingPathExtension: normExtension]];
    }

    // Vector files
    if ([fieldInfos hasVectors])
    {
        extensions = [VECTOR_EXTENSIONS objectEnumerator];
        while ((extension = [extensions nextObject]))
        {
            [members addObject: [segment stringByAppendingPathExtension: extension]];
        }
    }

    // Register every collected file with the writer
    for (m = 0; m < [members count]; m++)
    {
        [writer addFile: [members objectAtIndex: m]];
    }

    // Perform the merge
    [writer close];
    DESTROY(writer);

    return AUTORELEASE(members);
}
/**
 * Registers each name in n with fi as an indexed field carrying the given
 * term vector flags. Norms are marked as omitted for a field when reader r
 * reports no norms for it.
 */
- (void) addIndexed: (LCIndexReader *) r
         fieldInfos: (LCFieldInfos *) fi
              names: (NSArray *) n
 isTermVectorStored: (BOOL) tv
isStorePositionWithTermVector: (BOOL) pos
isStoreOffsetWithTermVector: (BOOL) off
{
    int total = [n count];
    int index;
    for (index = 0; index < total; index++)
    {
        NSString *fieldName = [n objectAtIndex: index];
        [fi addName: fieldName
          isIndexed: YES
 isTermVectorStored: tv
isStorePositionWithTermVector: pos
isStoreOffsetWithTermVector: off
          omitNorms: (![r hasNorms: fieldName])];
    }
}
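/**
* Merges the field names and the stored field values of all readers into the new segment.
* @return The number of non-deleted documents written from all of the readers
* @throws IOException
*/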
- (int) mergeFields
{
    // Start from a fresh field table; it is filled from every reader below.
    ASSIGN(fieldInfos, AUTORELEASE([[LCFieldInfos alloc] init]));

    int mergedDocs = 0;
    LCIndexReader *source;
    NSEnumerator *sources = [readers objectEnumerator];

    // Pass 1: merge field names, one call per term vector flavour.
    while ((source = (LCIndexReader *) [sources nextObject]))
    {
        [self addIndexed: source
              fieldInfos: fieldInfos
                   names: [source fieldNames: LCFieldOption_TERMVECTOR_WITH_POSITION_OFFSET]
      isTermVectorStored: YES
isStorePositionWithTermVector: YES
isStoreOffsetWithTermVector: YES];

        [self addIndexed: source
              fieldInfos: fieldInfos
                   names: [source fieldNames: LCFieldOption_TERMVECTOR_WITH_POSITION]
      isTermVectorStored: YES
isStorePositionWithTermVector: YES
isStoreOffsetWithTermVector: NO];

        [self addIndexed: source
              fieldInfos: fieldInfos
                   names: [source fieldNames: LCFieldOption_TERMVECTOR_WITH_OFFSET]
      isTermVectorStored: YES
isStorePositionWithTermVector: NO
isStoreOffsetWithTermVector: YES];

        [self addIndexed: source
              fieldInfos: fieldInfos
                   names: [source fieldNames: LCFieldOption_TERMVECTOR]
      isTermVectorStored: YES
isStorePositionWithTermVector: NO
isStoreOffsetWithTermVector: NO];

        [self addIndexed: source
              fieldInfos: fieldInfos
                   names: [source fieldNames: LCFieldOption_INDEXED]
      isTermVectorStored: NO
isStorePositionWithTermVector: NO
isStoreOffsetWithTermVector: NO];

        [fieldInfos addCollection: [source fieldNames: LCFieldOption_UNINDEXED]
                        isIndexed: NO];
    }

    // Persist the merged field table as <segment>.fnm.
    NSString *fieldNamesFile = [segment stringByAppendingPathExtension: @"fnm"];
    [fieldInfos write: directory name: fieldNamesFile];

    // Pass 2: copy the stored fields of every document that is not deleted.
    LCFieldsWriter *storedFields =
        [[LCFieldsWriter alloc] initWithDirectory: directory
                                          segment: segment
                                       fieldInfos: fieldInfos];
    sources = [readers objectEnumerator];
    while ((source = (LCIndexReader *) [sources nextObject]))
    {
        int limit = [source maximalDocument];
        int doc;
        for (doc = 0; doc < limit; doc++)
        {
            if ([source isDeleted: doc])
            {
                continue; // skip deleted docs
            }
            [storedFields addDocument: [source document: doc]];
            mergedDocs++;
        }
    }
    [storedFields close];
    DESTROY(storedFields);

    return mergedDocs;
}
/**
* Merge the TermVectors from each of the segments into the new one.
* @throws IOException
*/
- (void) mergeVectors
{
    LCTermVectorsWriter *vectorsOut =
        [[LCTermVectorsWriter alloc] initWithDirectory: directory
                                               segment: segment
                                            fieldInfos: fieldInfos];
    NSEnumerator *sources = [readers objectEnumerator];
    LCIndexReader *source;

    while ((source = (LCIndexReader *) [sources nextObject]))
    {
        int limit = [source maximalDocument];
        int doc;
        for (doc = 0; doc < limit; doc++)
        {
            // Only documents that are still live contribute vectors.
            if (![source isDeleted: doc])
            {
                [vectorsOut addAllDocumentVectors: [source termFrequencyVectors: doc]];
            }
        }
    }

    [vectorsOut close];
    DESTROY(vectorsOut);
}
/**
 * Opens the frequency (.frq) and position (.prx) outputs together with the
 * term dictionary writer and the merge queue, merges all terms through
 * -mergeTermInfos, and closes everything again.
 */
- (void) mergeTerms
{
    NSString *file = [segment stringByAppendingPathExtension: @"frq"];
    ASSIGN(freqOutput, [directory createOutput: file]);

    file = [segment stringByAppendingPathExtension: @"prx"];
    ASSIGN(proxOutput, [directory createOutput: file]);

    ASSIGN(termInfosWriter, AUTORELEASE([[LCTermInfosWriter alloc] initWithDirectory: directory
                                                                              segment: segment
                                                                           fieldInfos: fieldInfos
                                                                             interval: termIndexInterval]));
    // The skip interval is dictated by the term dictionary writer.
    skipInterval = [termInfosWriter skipInterval];

    ASSIGN(queue, AUTORELEASE([(LCSegmentMergeQueue *)[LCSegmentMergeQueue alloc] initWithSize: [readers count]]));

    [self mergeTermInfos];

    // Messaging nil is a no-op, so no explicit nil checks are required here.
    [freqOutput close];
    [proxOutput close];
    [termInfosWriter close];
    [queue close];
}
/**
 * Drives the term merge: seeds the queue with one merge info per reader,
 * then repeatedly pops every entry positioned on the smallest term, merges
 * them as one term and puts back the entries that still have terms left.
 */
- (void) mergeTermInfos
{
    int docBase = 0;
    int r;

    for (r = 0; r < [readers count]; r++)
    {
        LCIndexReader *source = (LCIndexReader *) [readers objectAtIndex: r];
        LCTermEnumerator *terms = [source termEnumerator];
        LCSegmentMergeInfo *info =
            [[LCSegmentMergeInfo alloc] initWithBase: docBase
                                      termEnumerator: terms
                                              reader: source];
        docBase += [source numberOfDocuments];

        if (![info hasNextTerm])
        {
            [info close];
        }
        else
        {
            [queue put: info]; // initialize queue
        }
        DESTROY(info);
    }

    // Scratch array reused for every term; only the first `used` slots are valid.
    NSMutableArray *matches = [[NSMutableArray alloc] init];

    while ([queue size] > 0)
    {
        int used = 0;

        // The head of the queue defines the term merged in this round.
        if ([matches count] > 0)
        {
            [matches replaceObjectAtIndex: used withObject: [queue pop]];
        }
        else
        {
            [matches addObject: [queue pop]];
        }
        used++;

        LCTerm *current = [[matches objectAtIndex: 0] term];
        LCSegmentMergeInfo *head;

        // Collect every further entry positioned on the same term.
        for (head = (LCSegmentMergeInfo *) [queue top];
             head != nil && [current compare: [head term]] == NSOrderedSame;
             head = (LCSegmentMergeInfo *) [queue top])
        {
            if (used < [matches count])
            {
                [matches replaceObjectAtIndex: used withObject: [queue pop]];
            }
            else
            {
                [matches addObject: [queue pop]];
            }
            used++;
        }

        [self mergeTermInfo: matches size: used]; // add new TermInfo

        // Walk the collected entries backwards, as the original loop does.
        int k;
        for (k = used - 1; k >= 0; k--)
        {
            LCSegmentMergeInfo *info = [matches objectAtIndex: k];
            if ([info hasNextTerm])
            {
                [queue put: info]; // restore queue
            }
            else
            {
                [info close]; // done with a segment
            }
        }
    }

    DESTROY(matches);
}
/** Merge one term found in one or more segments. The array smis
* contains segments that are positioned at the same term. N
* is the number of cells in the array actually occupied.
*
* @param smis array of segments
* @param n number of cells in the array actually occupied
*/
- (void) mergeTermInfo: (NSArray *) smis size: (int) n
{
    // Remember where this term's data starts before anything is appended.
    long freqPointer = [freqOutput offsetInFile];
    long proxPointer = [proxOutput offsetInFile];

    int df = [self appendPosting: smis size: n]; // append posting data
    long skipPointer = [self writeSkip];

    if (df <= 0)
    {
        // No document contains the term: nothing goes into the dictionary.
        return;
    }

    // Allocate the term info only when it is used, so that it is always
    // released; it used to leak whenever df was 0.
    LCTermInfo *ti = [[LCTermInfo alloc] init];

    // add an entry to the dictionary with pointers to prox and freq files
    [ti setDocumentFrequency: df];
    [ti setFreqPointer: freqPointer];
    [ti setProxPointer: proxPointer];
    [ti setSkipOffset: (long)(skipPointer - freqPointer)];
    [termInfosWriter addTerm: [[smis objectAtIndex: 0] term]
                    termInfo: ti];
    DESTROY(ti);
}
/** Process postings from multiple segments all positioned on the
* same term. Writes out merged entries into freqOutput and
* the proxOutput streams.
*
* @param smis array of segments
* @param n number of cells in the array actually occupied
* @return number of documents across all segments where this term was found
*/
- (int) appendPosting: (NSArray *) smis size: (int) n
{
    int previousDoc = 0;
    int docFreq = 0; // number of docs w/ term
    int s;

    [self resetSkip];

    for (s = 0; s < n; s++)
    {
        LCSegmentMergeInfo *info = [smis objectAtIndex: s];
        id postings = [info postings];
        int docBase = [info base];
        NSArray *deletionMap = [info docMap];

        [postings seekTermEnumerator: [info termEnumerator]];

        while ([postings hasNextDocument])
        {
            int docNumber = [postings document];
            if (deletionMap != nil)
            {
                // map around deletions
                docNumber = [[deletionMap objectAtIndex: docNumber] intValue];
            }
            docNumber += docBase; // convert to merged space

            if (docNumber < previousDoc)
            {
                NSLog(@"docs out of order");
            }

            docFreq++;
            if ((docFreq % skipInterval) == 0)
            {
                [self bufferSkip: previousDoc];
            }

            // The doc delta is shifted left; the low bit flags freq == 1.
            int delta = docNumber - previousDoc;
            int code = delta << 1;
            previousDoc = docNumber;

            int termFreq = [postings frequency];
            if (termFreq != 1)
            {
                [freqOutput writeVInt: code];     // write doc
                [freqOutput writeVInt: termFreq]; // write frequency in doc
            }
            else
            {
                [freqOutput writeVInt: (code | 1)]; // write doc & freq=1
            }

            // Positions are stored as deltas from the previous position.
            int previousPosition = 0;
            int p;
            for (p = 0; p < termFreq; p++)
            {
                int position = [postings nextPosition];
                [proxOutput writeVInt: position - previousPosition];
                previousPosition = position;
            }
        }
    }

    return docFreq;
}
/* Empties the skip buffer and restarts the skip bookkeeping at the current
 * ends of the frequency and position outputs. */
- (void) resetSkip
{
    [skipBuffer reset];

    long freqStart = [freqOutput offsetInFile];
    long proxStart = [proxOutput offsetInFile];

    lastSkipDoc = 0;
    lastSkipFreqPointer = freqStart;
    lastSkipProxPointer = proxStart;
}
/* Appends one skip entry to the skip buffer: the document delta and the
 * freq/prox offset deltas since the previous entry. The current values
 * then become the new reference point. */
- (void) bufferSkip: (int) doc
{
    long freqNow = [freqOutput offsetInFile];
    long proxNow = [proxOutput offsetInFile];

    int freqDelta = (int) (freqNow - lastSkipFreqPointer);
    int proxDelta = (int) (proxNow - lastSkipProxPointer);

    [skipBuffer writeVInt: (doc - lastSkipDoc)];
    [skipBuffer writeVInt: freqDelta];
    [skipBuffer writeVInt: proxDelta];

    lastSkipDoc = doc;
    lastSkipFreqPointer = freqNow;
    lastSkipProxPointer = proxNow;
}
/* Copies the buffered skip data to the frequency output and returns the
 * offset at which that data begins. */
- (long) writeSkip
{
    // The offset has to be captured before the buffer is appended.
    long skipStart = [freqOutput offsetInFile];
    [skipBuffer writeTo: freqOutput];
    return skipStart;
}
/* Writes one norm file (<segment>.f<N>) for every indexed field that keeps
 * norms, concatenating the norm bytes of the non-deleted documents of all
 * readers in reader order. */
- (void) mergeNorms
{
    int fieldNumber;
    for (fieldNumber = 0; fieldNumber < [fieldInfos size]; fieldNumber++)
    {
        LCFieldInfo *info = [fieldInfos fieldInfoWithNumber: fieldNumber];
        if (![info isIndexed] || [info omitNorms])
        {
            continue; // this field has no norm file
        }

        NSString *extension = [NSString stringWithFormat: @"f%d", fieldNumber];
        NSString *normFile = [segment stringByAppendingPathExtension: extension];
        LCIndexOutput *normOutput = [directory createOutput: normFile];

        int r;
        for (r = 0; r < [readers count]; r++)
        {
            LCIndexReader *source = (LCIndexReader *) [readers objectAtIndex: r];

            // The reader fills this buffer with the norms of the field.
            NSMutableData *norms = [[NSMutableData alloc] init];
            [source setNorms: [info name] bytes: norms offset: 0];

            char *raw = (char *)[norms bytes];
            int available = [norms length];
            int doc;
            for (doc = 0; doc < available; doc++)
            {
                if ([source isDeleted: doc])
                {
                    continue;
                }
                [normOutput writeByte: raw[doc]];
            }
            DESTROY(norms);
        }

        [normOutput close];
    }
}
@end