///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using JetBrains.DataStructures;
using JetBrains.Omea.Containers;

namespace JetBrains.Omea.TextIndex
{
    /// <summary>
    /// Static helpers for flushing per-document token tables into a term index
    /// and for writing/reading the binary pieces of index entries
    /// (signature, entry header, token offsets, variable-length counts).
    /// </summary>
    public class IndexConstructor
    {
        /// <summary>
        /// Gets or sets the working directory string shared by all users of this class.
        /// The value is stored as-is; no validation is performed here.
        /// </summary>
        public static string WorkDir
        {
            get { return strWorkDir; }
            set { strWorkDir = value; }
        }

        /// <summary>
        /// Adds one record to <paramref name="termIndex"/> for every entry of
        /// <paramref name="tokens"/>, passing the entry's key and value together with
        /// <paramref name="docId"/> and <paramref name="maxTermInDoc"/>.
        /// </summary>
        /// <param name="termIndex">Accessor that receives the records.</param>
        /// <param name="docId">Identifier of the document being flushed.</param>
        /// <param name="maxTermInDoc">Forwarded unchanged to <c>AddRecord</c>.</param>
        /// <param name="tokens">Table whose entries are flushed.</param>
        /// <exception cref="FormatException">
        /// Thrown when <c>AddRecord</c> fails for any entry; the original exception
        /// is preserved as <see cref="Exception.InnerException"/>.
        /// </exception>
        public static void FlushDocument( TermIndexAccessor termIndex, int docId, int maxTermInDoc, IntHashTable tokens )
        {
            foreach( IntHashTable.Entry e in tokens )
            {
                try
                {
                    termIndex.AddRecord( docId, e.Key, e.Value, maxTermInDoc );
                }
                catch( Exception exc )
                {
                    // Build the diagnostic text once; it is both traced and used as the
                    // message of the wrapping exception.
                    string message = "-- IndexConstructor -- Flushing document -- exception occured with key " + e.Key;
                    Trace.WriteLineIf( !FullTextIndexer._suppTrace, message );
                    throw new FormatException( message, exc );
                }
            }
        }

        #region IndexConstruction

        /// <summary>
        /// Writes one index entry for a document. <paramref name="instances"/> is either
        /// a <c>List&lt;long&gt;</c> of packed token offsets or a single boxed <c>long</c>.
        /// </summary>
        /// <param name="writer">Destination of the serialized entry.</param>
        /// <param name="docId">Document identifier written at the start of the entry.</param>
        /// <param name="termId">Not used by this method.</param>
        /// <param name="instances">Offsets list, or one boxed offset.</param>
        /// <param name="maxTermInDoc">Denominator for the term-frequency metric.</param>
        internal static void WriteEntry( BinaryWriter writer, int docId, int termId, object instances, int maxTermInDoc )
        {
            List<long> offsets = instances as List<long>;

            // Anything that is not a list is treated as exactly one occurrence.
            int instancesOnDoc = ( offsets == null ) ? 1 : offsets.Count;
            float tfIdf = CalcMetric( instancesOnDoc, maxTermInDoc, 1.0 );

            if( offsets != null )
            {
                WriteEntry( writer, docId, tfIdf, offsets );
            }
            else
            {
                WriteEntry( writer, docId, tfIdf, (long) instances );
            }
        }

        /// <summary>
        /// Computes <c>(0.5 + 0.5 * instancesNumber / maxTermFreqInDoc) * extRelevanceRatio</c>
        /// in double precision and narrows the result to <c>float</c>.
        /// </summary>
        /// <remarks>
        /// The division is floating-point, so a zero <paramref name="maxTermFreqInDoc"/>
        /// does not throw; it yields a non-finite result.
        /// </remarks>
        private static float CalcMetric( int instancesNumber, int maxTermFreqInDoc, double extRelevanceRatio )
        {
            double tf = 0.5 + 0.5 * ((double)instancesNumber) / ((double)maxTermFreqInDoc );
            return( (float)( tf * extRelevanceRatio ) );
        }

        /// <summary>
        /// Writes the index signature: current local time in ticks, the index version,
        /// and a placeholder value.
        /// </summary>
        /// <param name="fileIndex">Destination writer.</param>
        internal static void WriteSignature( BinaryWriter fileIndex )
        {
            long dateInTicks = DateTime.Now.Ticks;
            fileIndex.Write( dateInTicks );
            fileIndex.Write( IndexAccessorImpl.Version );   //  Version control signature
            fileIndex.Write( 0x7FFFFFFF );                  //  Maximal term/doc ID (to be written later)
        }

        /// <summary>
        /// Writes an entry with several offsets: header followed by every offset of
        /// <paramref name="offsets"/> in enumeration order.
        /// </summary>
        private static void WriteEntry( BinaryWriter writer, int docID, float tfIdf, ICollection<long> offsets )
        {
            WriteEntryHeader( writer, docID, tfIdf, offsets.Count );
            foreach( long offset in offsets )
            {
                WriteOffset( writer, offset );
            }
        }

        /// <summary>
        /// Writes an entry with exactly one offset, in the same layout as the
        /// multi-offset overload.
        /// </summary>
        private static void WriteEntry( BinaryWriter writer, int docID, float tfIdf, long offset )
        {
            WriteEntryHeader( writer, docID, tfIdf, 1 );
            WriteOffset( writer, offset );
        }

        /// <summary>
        /// Writes the part common to every entry: document id (variable-length),
        /// metric (4-byte float) and the number of offsets minus one (variable-length).
        /// </summary>
        private static void WriteEntryHeader( BinaryWriter writer, int docID, float tfIdf, int offsetsCount )
        {
            WriteCount( writer, docID );
            writer.Write( tfIdf );
            WriteCount( writer, offsetsCount - 1 ); // save count minus 1
        }

        /// <summary>
        /// Writes one packed offset as two 32-bit words, low word first.
        /// </summary>
        private static void WriteOffset( BinaryWriter writer, long offset )
        {
            //  long value "Offset" consists of 3 fields:
            //  - (ushort) token order
            //  - (ushort) token sentence number
            //  - (int)    token offset;
            //  unchecked: the casts are pure bit extraction and must not throw for
            //  values with the top bit set when the assembly is compiled with /checked.
            unchecked
            {
                writer.Write( (uint) ( offset & 0xffffffff ) );
                writer.Write( (uint) ( offset >> 32 ) );
            }
        }

        /// <summary>
        /// Writes a non-negative integer in a variable-length form: 7 bits per byte,
        /// least significant group first; only the final byte has its high bit (0x80) set.
        /// </summary>
        /// <param name="writer">Destination writer.</param>
        /// <param name="count">Value to write; asserted to be non-negative in debug builds.</param>
        internal static void WriteCount( BinaryWriter writer, int count )
        {
            Debug.Assert( count >= 0 );
            while( count > 0x7f )
            {
                writer.Write( (byte) ( count & 0x7f ) );
                count >>= 7;
            }
            // Here count fits in 7 bits, so adding 0x80 only sets the terminator flag.
            writer.Write( (byte) ( count + 0x80 ) );
        }

        /// <summary>
        /// Reads a value written by <see cref="WriteCount"/>: accumulates 7-bit groups,
        /// least significant first, until a byte with the high bit set has been consumed.
        /// </summary>
        /// <param name="reader">Source reader.</param>
        /// <returns>The decoded value.</returns>
        internal static int ReadCount( BinaryReader reader )
        {
            int count = 0;
            int bits = 0;
            byte b;
            do
            {
                b = reader.ReadByte();
                count += ( ( b & 0x7f ) << bits );
                bits += 7;
            }
            while( ( b & 0x80 ) == 0 );     // high bit marks the last byte
            return count;
        }

        #endregion IndexConstruction

        #region Attributes

        // NOTE(review): presumably the size in bytes of the index signature area;
        // the value is not derived from WriteSignature here — confirm against readers.
        public const int ciSignatureLength = 28;

        // Backing field of WorkDir.
        private static string strWorkDir;

        #endregion
    }
}