///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.IO;
using System.Collections;
using System.Diagnostics;
using JetBrains.Omea.OpenAPI;
namespace JetBrains.Omea.TextIndex
{
//-----------------------------------------------------------------------------
// Wrapper around the binary representation of a term record.
// Structure:
// 8b Record Term hash code (HC)
// 1b Term Descriptor (not used now)
// 4b Chaining Offset
// 4b Amount of Entries
// {Entry}+
//
// Entry structure:
// 4b Document (resource) index
// 4b TF*IDF metric
// 4b Amount of Instances (stored as the count minus 1)
// {Instance}+
//
// Instance structure:
// 4b Offset (token offset plus section bits)
// 4b Compound info (sentence number and token order)
//-----------------------------------------------------------------------------
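// A minimal reading sketch (hypothetical variable names; the caller is
// responsible for positioning the reader at the start of a record):
//
//     TermIndexRecord record = new TermIndexRecord( reader );
//     for( int i = 0; i < record.DocsNumber; i++ )
//     {
//         Entry e = record.GetEntryAt( i );
//         // e.DocIndex, e.TfIdf, e.Count and e.Offsets describe the term in one document
//     }
//-----------------------------------------------------------------------------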
public class TermIndexRecord
{
public TermIndexRecord( BinaryReader reader )
{
try
{
listTemporaryStorage.Clear();
HC = IndexConstructor.ReadCount( reader );
while( true )
{
ParseEntry( reader );
// _chainsCount++;
}
}
catch( EndOfStreamException )
{
if( listTemporaryStorage.Count > 0 )
{
aEntries = new Entry[ listTemporaryStorage.Count ];
listTemporaryStorage.CopyTo( aEntries );
}
}
}
//-------------------------------------------------------------------------
// Parses a plain sequence of bytes into entries and their instances.
// Comment: Some entries may be marked as "removed", which means that the
// corresponding documents no longer exist. Thus the field "DocsNumber"
// counts *ALL* entries - valid and removed - since we have no ability to
// physically strip the byte sequence. Non-existing documents are marked
// with "-1" as DocID. Thus the actual space is allocated only AFTER the
// number of entries is known.
//-------------------------------------------------------------------------
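// Note: the instance count is stored on disk as (count - 1) - see Save() below -
// which is why ParseEntry adds 1 back after IndexConstructor.ReadCount.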
protected static void ParseEntry( BinaryReader reader )
{
int instancesNumber;
Entry new_ = new Entry();
new_.DocIndex = IndexConstructor.ReadCount( reader );
new_.TfIdf = reader.ReadSingle();
instancesNumber = IndexConstructor.ReadCount( reader ) + 1;
if( instancesNumber < 0 )
{
throw new FormatException( "TermIndexRecord -- Illegal number of instances for a TermIndex record (" + instancesNumber + ") - possible index corruption" );
}
// NB: Discuss an OpenAPI issue for getting the current maximal value of a document Id
// from the ResourceStore.
// if( new_.DocIndex >= 10000000 )
// throw( new IndexConstructor.TextIndexCorruption( "[DocIndex=" + new_.DocIndex + "] value in [TermIndex record Entry] is greater than a reasonable number of documents - possible index corruption" ));
//-----------------------------------------------------------------
try
{
if( new_.DocIndex != -1 )
{
InstanceOffset[] Offsets = new InstanceOffset[ instancesNumber ];
for( int j = 0; j < instancesNumber; j++ )
{
Offsets[ j ].Offset = reader.ReadUInt32();
Offsets[ j ].CompoundInfo = reader.ReadUInt32();
}
new_.Offsets = Offsets;
listTemporaryStorage.Add( new_ );
}
else
{
// this entry has been "removed", do not use in subsequent
// processing
new_ = null;
}
}
catch( OutOfMemoryException )
{
throw new FormatException( "TermIndexRecord - illegal number of term instances: [" + instancesNumber + "]" );
}
}
//-------------------------------------------------------------------------
// Assuming that the caller has already set the necessary offset in the binary
// stream
//-------------------------------------------------------------------------
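// A usage sketch ("recordOffset" is a hypothetical value computed by the caller):
//
//     writer.BaseStream.Seek( recordOffset, SeekOrigin.Begin );
//     record.Save( writer );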
public void Save( BinaryWriter writer )
{
Debug.Assert( DocsNumber > 0 );
IndexConstructor.WriteCount( writer, HC );
//---------------------------------------------------------------------
for( int i = 0; i < DocsNumber; i++ )
{
Entry e = GetEntryAt( i );
IndexConstructor.WriteCount( writer, e.DocIndex );
writer.Write( e.TfIdf );
IndexConstructor.WriteCount( writer, e.Count - 1 ); // save count minus 1
foreach( InstanceOffset insoff in e.Offsets )
{
writer.Write( insoff.Offset );
writer.Write( insoff.CompoundInfo );
}
}
}
public void Compress()
{
ArrayList validEntries = new ArrayList();
for( int i = 0; i < DocsNumber; i++ )
{
Entry e = GetEntryAt( i );
if( e.DocIndex != -1 )
validEntries.Add( e );
}
Debug.Assert( validEntries.Count > 0, "After compression the number of valid entries must be positive" );
aEntries = (Entry[])validEntries.ToArray( typeof( Entry ));
}
public void PopulateRecordID( ushort termNumber )
{
for( int i = 0; i < DocsNumber; i++ )
{
for( int j = 0; j < aEntries[ i ].Count; j++ )
aEntries[ i ].Offsets[ j ].BaseID = termNumber;
}
}
public int DocsNumber { get{ return( (aEntries == null)? 0 : aEntries.Length ); } }
public Entry[] Entries
{
get{ return aEntries; }
set{ aEntries = value; }
}
public Entry GetEntryAt( int i_ )
{
Debug.Assert( i_ >= 0 && i_ < aEntries.Length );
return( aEntries[ i_ ] );
}
#region Attributes
public const int ciRecordPrologSize = 4 + 1 + 4 + 4;
public const int ciEntryPrologSize = 4 + 4 + 4;
public const int ciEntryDataSize = 4 + 4;
protected static ArrayList listTemporaryStorage = new ArrayList();
public int HC;
protected Entry[] aEntries;
// public int ChainingOffset;
// public ushort _termNumber;
// public int _chainsCount;
#endregion
}
//-----------------------------------------------------------------------------
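// Keeps the per-document data of a term: the document (resource) index, the
// TF*IDF relevance metric and the offsets of the term instances in that document.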
//-----------------------------------------------------------------------------
public class Entry : IComparable
{
public Entry() { Proximity = EntryProximity.Document; }
//-------------------------------------------------------------------------
public int DocIndex
{
get{ return iDocIndex; }
set{ iDocIndex = value; }
}
public float TfIdf
{
get{ return fTfIdf; }
set{ fTfIdf = value; }
}
public InstanceOffset[] Offsets
{
get{ return( aInstances ); }
set{ aInstances = value; }
}
public int Count
{
get{ return aInstances.Length; }
}
public InstanceOffset Instance( int i_ )
{
Debug.Assert( aInstances != null );
Debug.Assert( i_ < aInstances.Length );
return( aInstances[ i_ ] );
}
public EntryProximity Proximity
{
set{ ResultProximity = value; }
get{ return ResultProximity; }
}
//-------------------------------------------------------------------------
// Define default sorting criterion - by Document ID
//-------------------------------------------------------------------------
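// E.g. "Array.Sort( record.Entries );" leaves the entries ordered by DocIndex.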
int IComparable.CompareTo( object o )
{
Entry entry_ = (Entry)o;
if( iDocIndex < entry_.DocIndex )
return( -1 );
else
if( iDocIndex == entry_.DocIndex )
return( 0 );
else
return( 1 );
}
internal InstanceOffset[] FilterOffsetsBySection( uint sectionId )
{
ArrayList validOffsets = new ArrayList();
for( int i = 0; i < Count; i++ )
{
if( Instance( i ).SectionId == sectionId )
validOffsets.Add( Instance( i ) );
}
return (InstanceOffset[]) validOffsets.ToArray( typeof( InstanceOffset ) );
}
//-------------------------------------------------------------------------
#region Attributes
protected int iDocIndex;
protected float fTfIdf;
protected InstanceOffset[] aInstances;
protected EntryProximity ResultProximity;
#endregion
}
///
/// NB: the method returns the "inverted" value of the comparison, so that the
/// standard Array.Sort operation automatically orders the elements in
/// descending order of the metric value.
///
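/// Usage sketch: "Array.Sort( entries, new CompareByTfIdf() );" puts the entry
/// with the highest TF*IDF value first.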
public class CompareByTfIdf : IComparer
{
int IComparer.Compare( object left, object right )
{
if( ((Entry)left).TfIdf < ((Entry)right).TfIdf )
return( +1 );
else
if( ((Entry)left).TfIdf > ((Entry)right).TfIdf )
return( -1 );
else
return( 0 );
}
}
//-----------------------------------------------------------------------------
// Structure keeps the data of a single term instance in a document - the packed
// offset (with section information), the compound info (sentence number and
// token order) and the base term ID of the owning record.
//-----------------------------------------------------------------------------
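// Field packing, as implied by the accessors below (a descriptive sketch, not
// an authoritative layout):
//     iOffset       - lower 24 bits: token offset (OffsetNormal);
//                     upper bits:    section id (SectionId)
//     iCompoundInfo - lower 16 bits: sentence number (Sentence);
//                     upper 16 bits: token order (TokenOrder)
// BaseID is not read from or written to the stream; it is filled in by
// TermIndexRecord.PopulateRecordID().
//-----------------------------------------------------------------------------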
public struct InstanceOffset
{
public uint Offset { get{ return( iOffset ); } set{ iOffset = value; }}
public int OffsetNormal { get{ return (int)(iOffset & 0x00FFFFFF); } }
public uint CompoundInfo { get{ return iCompoundInfo; } set{ iCompoundInfo = value; }}
public ushort BaseID { get{ return( BaseTermID ); } set{ BaseTermID = value; }}
public ushort Sentence { get{ return (ushort) (iCompoundInfo & 0x0000FFFF); }}
public ushort TokenOrder { get{ return (ushort) ((iCompoundInfo & 0xFFFF0000) >> 16 ); }}
public uint SectionId { get{ return( iOffset & 0x1CFFFFFF) >> 26; }}
uint iOffset;
uint iCompoundInfo;
ushort BaseTermID;
}
}