#include "LCTermVectorsWriter.h"
#include "LCIndexOutput.h"
#include "LCTermPositionVector.h"
#include "LCTermVectorOffsetInfo.h"
#include "NSString+Additions.h"
#include "GNUstep.h"

/**
 * Writer works by opening a document, then opening the fields within the
 * document, and then writing out the vectors for each field.
 *
 * Rough usage:
 *
 *   for each document
 *   {
 *     writer.openDocument();
 *     for each field on the document
 *     {
 *       writer.openField(field);
 *       for all of the terms
 *       {
 *         writer.addTerm(...)
 *       }
 *       writer.closeField
 *     }
 *     writer.closeDocument()
 *   }
 *
 * @version $Id: LCTermVectorsWriter.m 438 2006-01-15 06:39:36Z yjchen $
 *
 */

/**
 * Book-keeping record for one field of the document being written:
 * its field number, where its data starts in the tvf stream, and which
 * optional per-term data (positions, offsets) is stored with it.
 */
@interface LCTVField: NSObject
{
  long number;
  long long tvfPointer;
  BOOL storePositions;
  BOOL storeOffsets;
}
- (id) initWithNumber: (long) number
        storePosition: (BOOL) storePos
          storeOffset: (BOOL) storeOff;
- (void) setTVFPointer: (long long) p;
- (long long) tvfPointer;
- (BOOL) storePositions;
- (BOOL) storeOffsets;
- (long) number;
@end

/**
 * One buffered term of the currently open field: its text, its frequency
 * and, optionally, its positions and offsets.
 */
@interface LCTVTerm: NSObject
{
  NSString *termText;
  long freq;
  NSArray *positions;
  NSArray *offsets;
}
- (void) setTermText: (NSString *) text;
- (void) setFreq: (long) f;
- (void) setPositions: (NSArray *) p;
- (void) setOffsets: (NSArray *) o;
- (NSString *) termText;
- (long) freq;
- (NSArray *) positions;
- (NSArray *) offsets;
@end

@interface LCTermVectorsWriter (LCPrivate)
- (void) openField: (int) fieldNumber
         isPositionWithTermVectorStored: (BOOL) storePositionWithTermVector
         isOffsetWithTermVectorStored: (BOOL) storeOffsetWithTermVector;
- (void) addTermInternal: (NSString *) termText
                    freq: (long) freq
               positions: (NSArray *) positions
                 offsets: (NSArray *) offsets;
- (void) writeField;
- (void) writeDoc;
@end

@implementation LCTermVectorsWriter

/**
 * Creates the three term-vector outputs (tvx, tvd, tvf) for the segment in
 * the given directory and writes the format version at the start of each.
 *
 * @param directory object answering -createOutput: for a file name
 * @param segment   segment name; the TVX/TVD/TVF extensions are appended
 * @param fis       field infos used to resolve field names to numbers
 */
- (id) initWithDirectory: (id ) directory
                 segment: (NSString *) segment
              fieldInfos: (LCFieldInfos *) fis
{
  self = [super init];
  if (self == nil)
    {
      return nil;
    }

  // Open files for TermVector storage
  NSString *file;

  file = [segment stringByAppendingPathExtension: TVX_EXTENSION];
  ASSIGN(tvx, [directory createOutput: file]);
  [tvx writeInt: (long)TERM_VECTORS_WRITER_FORMAT_VERSION];

  file = [segment stringByAppendingPathExtension: TVD_EXTENSION];
  ASSIGN(tvd, [directory createOutput: file]);
  [tvd writeInt: (long)TERM_VECTORS_WRITER_FORMAT_VERSION];

  file = [segment stringByAppendingPathExtension: TVF_EXTENSION];
  ASSIGN(tvf, [directory createOutput: file]);
  [tvf writeInt: (long)TERM_VECTORS_WRITER_FORMAT_VERSION];

  ASSIGN(fieldInfos, fis);
  fields = [[NSMutableArray alloc] init];
  terms = [[NSMutableArray alloc] init];
  currentDocPointer = -1; // -1 means "no document open"

  return self;
}

- (void) dealloc
{
  DESTROY(fieldInfos);
  DESTROY(fields);
  DESTROY(terms);
  DESTROY(tvx);
  DESTROY(tvd);
  DESTROY(tvf);
  DESTROY(currentField);
  [super dealloc];
}

/** Start a new document. A previously open document is closed first. */
- (void) openDocument
{
  [self closeDocument];
  currentDocPointer = [tvd offsetInFile];
}

/**
 * Finish the current document: closes any open field, writes the document
 * record and forgets the collected fields. Does nothing if no document is
 * open.
 */
- (void) closeDocument
{
  if ([self isDocumentOpen])
    {
      [self closeField];
      [self writeDoc];
      [fields removeAllObjects];
      currentDocPointer = -1;
    }
}

/** Return true if a document is currently open. */
- (BOOL) isDocumentOpen
{
  return currentDocPointer != -1;
}

/** Start processing a field. This can be followed by a number of calls to
 *  addTerm, and a final call to closeField to indicate the end of
 *  processing of this field. If a field was previously open, it is
 *  closed automatically.
 */
- (void) openField: (NSString *) field
{
  LCFieldInfo *fieldInfo = [fieldInfos fieldInfo: field];
  [self openField: [fieldInfo number]
        isPositionWithTermVectorStored: [fieldInfo isPositionWithTermVectorStored]
        isOffsetWithTermVectorStored: [fieldInfo isOffsetWithTermVectorStored]];
}

/**
 * Open a field by number, closing any field that is still open.
 * If no document is open the request is logged and ignored; otherwise the
 * field would later be written and then carried over into the record of
 * the next document, because closeDocument only clears the collected
 * fields while a document is open.
 */
- (void) openField: (int) fieldNumber
         isPositionWithTermVectorStored: (BOOL) storePositionWithTermVector
         isOffsetWithTermVectorStored: (BOOL) storeOffsetWithTermVector
{
  if (![self isDocumentOpen])
    {
      NSLog(@"Cannot open field when no document is open.");
      return;
    }

  [self closeField];
  ASSIGN(currentField,
         AUTORELEASE([[LCTVField alloc] initWithNumber: fieldNumber
                                         storePosition: storePositionWithTermVector
                                           storeOffset: storeOffsetWithTermVector]));
}

/** Finished processing current field. This should be followed by a call to
 *  openField before future calls to addTerm.
 */
- (void) closeField
{
  if ([self isFieldOpen])
    {
      // save field and terms
      [self writeField];
      [fields addObject: currentField];
      [terms removeAllObjects];
      DESTROY(currentField);
    }
}

/** Return true if a field is currently open. */
- (BOOL) isFieldOpen
{
  return currentField != nil;
}

/** Add term to the field's term vector. Field must already be open.
 *  Terms should be added in increasing order of terms, one call per
 *  unique term. Freq is the number of times this term appears in this
 *  field, in this document.
 *  If no document or no field is open, the call is logged and ignored.
 */
- (void) addTerm: (NSString *) termText freq: (long) freq
{
  [self addTerm: termText freq: freq positions: nil offsets: nil];
}

/**
 * Same as addTerm:freq: but also records the term's positions and offsets
 * (either may be nil).
 */
- (void) addTerm: (NSString *) termText
            freq: (long) freq
       positions: (NSArray *) positions
         offsets: (NSArray *) offsets
{
  if (![self isDocumentOpen])
    {
      NSLog(@"Cannot add terms when document is not open");
      return;
    }
  if (![self isFieldOpen])
    {
      NSLog(@"Cannot add terms when field is not open");
      return;
    }

  [self addTermInternal: termText
                   freq: freq
              positions: positions
                offsets: offsets];
}

/** Buffer one term for the open field without any state checks. */
- (void) addTermInternal: (NSString *) text
                    freq: (long) freq
               positions: (NSArray *) positions
                 offsets: (NSArray *) offsets
{
  LCTVTerm *term = [[LCTVTerm alloc] init];
  [term setTermText: text];
  [term setFreq: freq];
  [term setPositions: positions];
  [term setOffsets: offsets];
  [terms addObject: term];
  DESTROY(term);
}

/**
 * Add a complete document specified by all its term vectors. If document has no
 * term vectors, add value for tvx.
 *
 * Vectors conforming to LCTermPositionVector contribute positions and
 * offsets; all other vectors contribute only term text and frequency.
 *
 * @param vectors the term vectors of the document, may be nil
 */
- (void) addAllDocumentVectors: (NSArray *) vectors
{
  [self openDocument];

  if (vectors != nil)
    {
      int vectorCount = (int)[vectors count];
      int i;
      for (i = 0; i < vectorCount; i++)
        {
          id vector = [vectors objectAtIndex: i];
          BOOL isPositionVector =
            [vector conformsToProtocol: @protocol(LCTermPositionVector)];
          BOOL storePositionWithTermVector = NO;
          BOOL storeOffsetWithTermVector = NO;
          int termCount = (int)[vector size];

          if (isPositionVector && termCount > 0)
            {
              // The first term decides for the whole field. An empty array
              // counts as "not stored", just like nil (messaging nil
              // yields a count of 0).
              storePositionWithTermVector =
                ([[vector termPositions: 0] count] > 0);
              storeOffsetWithTermVector =
                ([[vector termOffsets: 0] count] > 0);
            }

          LCFieldInfo *fieldInfo = [fieldInfos fieldInfo: [vector field]];
          [self openField: [fieldInfo number]
                isPositionWithTermVectorStored: storePositionWithTermVector
                isOffsetWithTermVectorStored: storeOffsetWithTermVector];

          NSArray *termTexts = [vector allTerms];
          NSArray *termFreqs = [vector allTermFrequencies];
          int j;
          for (j = 0; j < termCount; j++)
            {
              NSArray *positions = nil;
              NSArray *offsets = nil;
              if (isPositionVector)
                {
                  positions = [vector termPositions: j];
                  offsets = [vector termOffsets: j];
                }
              // freq is a long, so read it as one for every kind of vector.
              [self addTermInternal: [termTexts objectAtIndex: j]
                               freq: [[termFreqs objectAtIndex: j] longValue]
                          positions: positions
                            offsets: offsets];
            }

          [self closeField];
        }
    }

  [self closeDocument];
}

/** Close all streams. Any open document is finished first. */
- (void) close
{
  [self closeDocument];

  // Messaging nil is a no-op, so streams that were never opened are skipped.
  [tvx close];
  [tvd close];
  [tvf close];
}

/**
 * Write the buffered terms of the current field to the tvf stream:
 * term count, a flags byte, then for every term the prefix-compressed
 * text, its frequency and, if enabled, delta-encoded positions and offsets.
 */
- (void) writeField
{
  // remember where this field is written
  [currentField setTVFPointer: [tvf offsetInFile]];

  long size = (long)[terms count];
  [tvf writeVInt: size];

  BOOL storePositions = [currentField storePositions];
  BOOL storeOffsets = [currentField storeOffsets];
  char bits = 0x0;
  if (storePositions)
    {
      bits |= STORE_POSITIONS_WITH_TERMVECTOR;
    }
  if (storeOffsets)
    {
      bits |= STORE_OFFSET_WITH_TERMVECTOR;
    }
  [tvf writeByte: bits];

  NSString *lastTermText = @"";
  int i;
  for (i = 0; i < size; i++)
    {
      LCTVTerm *term = (LCTVTerm *)[terms objectAtIndex: i];
      NSString *text = [term termText];
      long termFreq = [term freq];

      // Only the part of the text that differs from the previous term
      // is written.
      long start = (long)[lastTermText positionOfDifference: text];
      long length = (long)([text length] - start);
      [tvf writeVInt: start];   // write shared prefix length
      [tvf writeVInt: length];  // write delta length
      [tvf writeChars: text start: start length: length]; // write delta chars
      [tvf writeVInt: termFreq];
      lastTermText = text;

      if (storePositions)
        {
          NSArray *positions = [term positions];
          if (positions == nil)
            {
              // Writing continues: messaging nil yields 0 for every entry.
              NSLog(@"Trying to write positions that are null!");
            }

          // use delta encoding for positions
          int j;
          long position = 0;
          for (j = 0; j < termFreq; j++)
            {
              long current = [[positions objectAtIndex: j] longValue];
              [tvf writeVInt: (long)(current - position)];
              position = current;
            }
        }

      if (storeOffsets)
        {
          NSArray *offsets = [term offsets];
          if (offsets == nil)
            {
              // Writing continues: messaging nil yields 0 for every entry.
              NSLog(@"Trying to write offsets that are null!");
            }

          // use delta encoding for offsets
          int j;
          long position = 0;
          for (j = 0; j < termFreq; j++)
            {
              id info = [offsets objectAtIndex: j];
              long startOffset = [info startOffset];
              long endOffset = [info endOffset];
              [tvf writeVInt: (long)(startOffset - position)];
              // Save the diff between the two.
              [tvf writeVInt: (long)(endOffset - startOffset)];
              position = endOffset;
            }
        }
    }
}

/**
 * Write the record of the current document: its start pointer goes to tvx;
 * the number of fields, the field numbers and the delta-encoded tvf
 * pointers of the fields go to tvd.
 */
- (void) writeDoc
{
  if ([self isFieldOpen])
    {
      NSLog(@"Field is still open while writing document");
    }

  // write document index record
  [tvx writeLong: currentDocPointer];

  // write document data record
  long size = (long)[fields count];

  // write the number of fields
  [tvd writeVInt: size];

  // write field numbers
  int i;
  for (i = 0; i < size; i++)
    {
      LCTVField *field = (LCTVField *)[fields objectAtIndex: i];
      [tvd writeVInt: (long)[field number]];
    }

  // write field pointers
  // Same width as tvfPointer, so the delta is not truncated where long is
  // narrower than long long.
  long long lastFieldPointer = 0;
  for (i = 0; i < size; i++)
    {
      LCTVField *field = (LCTVField *)[fields objectAtIndex: i];
      long long pointer = [field tvfPointer];
      [tvd writeVLong: pointer - lastFieldPointer];
      lastFieldPointer = pointer;
    }
}

@end

@implementation LCTVField

- (id) init
{
  self = [super init];
  tvfPointer = 0;
  storePositions = NO;
  storeOffsets = NO;
  return self;
}

/**
 * @param n        field number
 * @param storePos YES if term positions are written for this field
 * @param storeOff YES if term offsets are written for this field
 */
- (id) initWithNumber: (long) n
        storePosition: (BOOL) storePos
          storeOffset: (BOOL) storeOff
{
  self = [self init];
  number = n;
  storePositions = storePos;
  storeOffsets = storeOff;
  return self;
}

- (void) setTVFPointer: (long long) p
{
  tvfPointer = p;
}

- (long long) tvfPointer
{
  return tvfPointer;
}

- (BOOL) storePositions
{
  return storePositions;
}

- (BOOL) storeOffsets
{
  return storeOffsets;
}

- (long) number
{
  return number;
}

- (NSString *) description
{
  return [NSString stringWithFormat: @"LCTVField: %ld", number];
}

@end

@implementation LCTVTerm

- (id) init
{
  self = [super init];
  freq = 0;
  positions = nil;
  offsets = nil;
  return self;
}

- (void) dealloc
{
  RELEASE(positions);
  RELEASE(offsets);
  RELEASE(termText);
  [super dealloc];
}

- (void) setTermText: (NSString *) text
{
  ASSIGN(termText, text);
}

- (void) setFreq: (long) f
{
  freq = f;
}

- (void) setPositions: (NSArray *) p
{
  // Keep a copy
  ASSIGNCOPY(positions, p);
}

- (void) setOffsets: (NSArray *) o
{
  // Keep a copy
  ASSIGNCOPY(offsets, o);
}

- (NSString *) termText
{
  return termText;
}

- (long) freq
{
  return freq;
}

- (NSArray *) positions
{
  return positions;
}

- (NSArray *) offsets
{
  return offsets;
}

- (NSString *) description
{
  return [NSString stringWithFormat: @"LCTVTerm %@: %ld", termText, freq];
}

@end