/// <copyright company="JetBrains">
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
/// </copyright>
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using JetBrains.DataStructures;
using JetBrains.Omea.OpenAPI;
using JetBrains.Omea.ResourceTools;
namespace JetBrains.Omea.TextIndex
{
/// <summary>
/// Special fake class used to designate a special case of a term - a stopword.
/// An object of this class is pushed onto the stack instead of an index entry
/// in order to perform special processing of the subsequent operations.
/// </summary>
internal class StopwordTerm {}
/// <summary>
/// Processes a query represented in postfix form, transforms query terms
/// into their document sets, and performs "anding" and "oring" of these sets.
/// The result set is the non-filtered result.
/// </summary>
public class QueryProcessor
{
/// <summary>
/// The query processor can currently return two error statuses:
/// 1. the input query has illegal syntax;
/// 2. a document section with the given name does not exist.
/// </summary>
public enum ErrorStatus { NoError, IllegalSectionName, IllegalQuerySyntax };
//-------------------------------------------------------------------------
public static ErrorStatus Status { get { return Error; } }
public static Entry[] ProcessQuery( QueryPostfixForm postfixForm, TermIndexAccessor termIndex )
{
return ProcessQuery( postfixForm, termIndex, false );
}
public static Entry[] ProcessQuery( QueryPostfixForm postfixForm, TermIndexAccessor termIndex, bool appendIdMappings )
{
Stack opStack = new Stack();
Entry[] result = null;
Error = ErrorStatus.NoError;
MappedInstances.Clear();
if( !appendIdMappings )
{
Lexemes.Clear();
Stopwords.Clear();
}
try
{
IteratePostfixExpression( postfixForm, termIndex, opStack );
//-----------------------------------------------------------------
// Now exactly one operand must remain on the top of the stack. It may
// be null if no documents correspond to the query.
//-----------------------------------------------------------------
if( opStack.Count != 1 )
throw new ApplicationException( "QueryParser -- Illegal query statement found" );
if( !( opStack.Peek() is StopwordTerm ) )
{
result = ExtractOperandFromStack( opStack );
if( result != null )
Array.Sort( result, new CompareByTfIdf() );
}
}
catch( Exception exc )
{
Trace.WriteLine( "QueryProcessor -- exception [" + exc.Message + "] occurred." );
// An exception is raised if the expression was constructed with
// syntactic errors. Record the error status and discard any
// partial result.
Error = ErrorStatus.IllegalQuerySyntax;
result = null;
}
opStack.Clear();
return( result );
}
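// A minimal usage sketch (hypothetical caller; ReportError and ShowResults
// are illustrative names, not part of this API). The postfix form is
// produced elsewhere by the query parser:
//
//   Entry[] hits = QueryProcessor.ProcessQuery( form, termIndex );
//   if( QueryProcessor.Status != QueryProcessor.ErrorStatus.NoError )
//       ReportError( QueryProcessor.Status );
//   else if( hits != null )
//       ShowResults( hits );  // entries arrive sorted by TF*IDF relevance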
private static void IteratePostfixExpression( IList<QueryParserNode> postfixForm, TermIndexAccessor termIndex, Stack opStack )
{
for( int i = 0; i < postfixForm.Count; i++ )
{
QueryParserNode node = postfixForm[ i ];
if( node.NodeType == QueryParserNode.Type.eoTerm )
{
PushTermOnStack( ((TermNode)node).Term, termIndex, opStack );
}
else
if( node.NodeType == QueryParserNode.Type.eoSection )
{
UnarySectionOp( ((SectionNode)node).SectionName, opStack );
}
else
{
BinaryOp( node, opStack );
}
}
}
private static void PushTermOnStack( string term, TermIndexAccessor index, Stack opStack )
{
if( !FullTextIndexer.isValuableToken( term ) )
{
opStack.Push( new StopwordTerm() );
if( Stopwords.IndexOf( term ) == -1 )
Stopwords.Add( term );
}
else
{
TermIndexRecord record = index.GetRecord( term );
if( record != null )
{
int order = Lexemes.IndexOf( term );
if( order == -1 )
{
Lexemes.Add( term );
order = Lexemes.Count - 1;
}
record.PopulateRecordID( (ushort) order );
}
opStack.Push( record );
}
}
private static void BinaryOp( QueryParserNode node, Stack opStack )
{
#region Preconditions
if ( opStack.Count < 2 )
throw new ApplicationException( "QueryProcessor -- Insufficient number of operands in the operating stack" );
#endregion Preconditions
// First, check whether one or both arguments are stopwords.
object o1 = opStack.Pop(), o2 = opStack.Peek();
opStack.Push( o1 );
// If both arguments are stopwords, a single stopword marker is left on
// the stack as the result for subsequent calculations.
// If only one argument is a stopword, its counterpart is left as the result.
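// E.g. for the query [ foo AND the ], "the" is a stopword, so the result
// of the conjunction is simply the document set of "foo".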
if( o1 is StopwordTerm )
{
opStack.Pop();
}
else
if( o2 is StopwordTerm )
{
opStack.Pop(); // o1
opStack.Pop(); // o2
opStack.Push( o1 ); // o1 again instead of the couple.
}
// Both arguments are normal terms which may occur in the index.
else
{
Entry[] result;
Entry[] rightIndices = ExtractOperandFromStack( opStack );
Entry[] leftIndices = ExtractOperandFromStack( opStack );
if( node.NodeType == QueryParserNode.Type.eoOr )
result = Join( leftIndices, rightIndices );
else
result = Intercross( leftIndices, rightIndices, ((OpNode)node).RequiredProximity );
opStack.Push( result );
}
}
#region Operators
//-------------------------------------------------------------------------
// "SECTION" Op
//-------------------------------------------------------------------------
private static void UnarySectionOp( string sectionName, Stack opStack )
{
if( ! ( opStack.Peek() is StopwordTerm ))
{
uint sectionId = 0;
if( DocSectionHelper.IsShortNameExist( sectionName ))
sectionId = DocSectionHelper.OrderByShortName( sectionName );
else
if( DocSectionHelper.IsFullNameExist( sectionName ))
sectionId = DocSectionHelper.OrderByFullName( sectionName );
if( sectionId > 0 )
{
Entry[] operand = ExtractOperandFromStack( opStack );
operand = SelectRestrictedEntries( operand, sectionId );
opStack.Push( operand );
}
else
Error = ErrorStatus.IllegalSectionName;
}
}
/// <summary>
/// "And" and "Near" ops. The particular behavior is determined by the
/// required proximity, which imposes an additional check on each
/// [intermediate] document feasible for inclusion into the result set.
/// </summary>
private static Entry[] Intercross( Entry[] leftIndices, Entry[] rightIndices,
EntryProximity reqProximity )
{
if(( leftIndices == null ) || ( rightIndices == null ))
return( null );
Array.Sort( leftIndices );
Array.Sort( rightIndices );
//---------------------------------------------------------------------
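// Linear merge over the two doc-sorted entry lists: advance the side with
// the smaller DocIndex; on a match, keep the pair only if it satisfies the
// required proximity.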
ArrayList array_ = new ArrayList();
int i_Left = 0, i_Right = 0;
while(( i_Left < leftIndices.Length ) && ( i_Right < rightIndices.Length ))
{
if( leftIndices[ i_Left ].DocIndex == rightIndices[ i_Right ].DocIndex )
{
EntryProximity scope = ProximityEstimator.EstimateProximity( leftIndices[ i_Left ], rightIndices[ i_Right ] );
if( scope <= reqProximity )
{
leftIndices[ i_Left ].Proximity = scope;
array_.Add( JoinInstancesOfEntries( leftIndices[ i_Left ], rightIndices[ i_Right ], reqProximity ));
}
i_Left++; i_Right++;
}
else
if( leftIndices[ i_Left ].DocIndex < rightIndices[ i_Right ].DocIndex )
i_Left++;
else
i_Right++;
}
//---------------------------------------------------------------------
Entry[] ai_Result = null;
if( array_.Count > 0 )
{
ai_Result = new Entry[ array_.Count ];
array_.CopyTo( ai_Result );
}
return( ai_Result );
}
//-------------------------------------------------------------------------
// "Or" Op
//-------------------------------------------------------------------------
private static Entry[] Join( Entry[] leftIndices, Entry[] rightIndices )
{
//---------------------------------------------------------------------
ArrayList array_ = new ArrayList();
if( leftIndices != null ) array_.AddRange( leftIndices );
if( rightIndices != null ) array_.AddRange( rightIndices );
array_.Sort();
//---------------------------------------------------------------------
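// After sorting, entries of the same document are adjacent; each pair of
// equal-DocIndex neighbours is merged into a single entry carrying the
// combined instance offsets.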
int i = 0, count = array_.Count;
while( i < count - 1 )
{
if( ((Entry)array_[ i ]).DocIndex == ((Entry)array_[ i + 1 ]).DocIndex )
{
array_[ i ] = JoinInstancesOfEntries( (Entry)array_[ i ], (Entry)array_[ i + 1 ],
EntryProximity.Document );
array_.RemoveAt( i + 1 );
count = array_.Count;
}
i++;
}
//---------------------------------------------------------------------
Entry[] ai_Result = null;
if( array_.Count > 0 )
{
ai_Result = new Entry[ array_.Count ];
array_.CopyTo( ai_Result );
}
return( ai_Result );
}
#endregion
/// <summary>
/// Extracts the operand from the top of the stack and casts it to the
/// required type (Entry[]), if necessary by extracting the data from the
/// term index record.
/// </summary>
private static Entry[] ExtractOperandFromStack( Stack opStack )
{
Object Operand = opStack.Pop();
Debug.Assert(( Operand is TermIndexRecord ) || ( Operand is Entry[] ) || ( Operand == null ));
if( Operand is TermIndexRecord )
return( ((TermIndexRecord)Operand).Entries );
else
return( (Entry[]) Operand );
}
//-------------------------------------------------------------------------
private static Entry JoinInstancesOfEntries( Entry left, Entry right,
EntryProximity requiredProximity )
{
Entry JoinedEntry = new Entry();
JoinedEntry.DocIndex = left.DocIndex;
JoinedEntry.TfIdf = left.TfIdf + right.TfIdf;
JoinedEntry.Proximity = left.Proximity;
InstanceOffset[] joinedOffsets;
// If the required proximity is phrasal, then we need to highlight
// only those terms and show only those contexts which correspond
// to search term instances EXACTLY within the phrases found, and not
// other instances located elsewhere in the document.
if( requiredProximity == EntryProximity.Phrase )
{
// The assumption is made that all offsets in the entries are
// sorted in ascending order.
ArrayList tempOffsets = new ArrayList();
int leftIndex = 0, rightIndex = 0;
while( leftIndex < left.Count && rightIndex < right.Count )
{
InstanceOffset leftOff = left.Offsets[ leftIndex ], rightOff = right.Offsets[ rightIndex ];
if( ProximityEstimator.isPhraseProximity( leftOff, rightOff ))
{
tempOffsets.Add( leftOff );
tempOffsets.Add( rightOff );
AddMappedInstances( tempOffsets, left.DocIndex, rightOff );
MappedInstances[ HC( left.DocIndex, leftOff.OffsetNormal ) ] = rightOff;
}
if( leftOff.OffsetNormal < rightOff.OffsetNormal )
leftIndex++;
else
rightIndex++;
}
joinedOffsets = (InstanceOffset[]) tempOffsets.ToArray( typeof( InstanceOffset ) );
}
else
{
joinedOffsets = new InstanceOffset[ left.Count + right.Count ];
left.Offsets.CopyTo( joinedOffsets, 0 );
right.Offsets.CopyTo( joinedOffsets, left.Count );
}
JoinedEntry.Offsets = joinedOffsets;
return( JoinedEntry );
}
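// MappedInstances chains each left instance of a matched phrase pair to its
// right neighbour; walking that chain collects all member instances when a
// phrase of three or more terms is assembled pair by pair.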
private static void AddMappedInstances( IList tempOffsets, int docIndex, InstanceOffset inst )
{
long hashCode = HC( docIndex, inst.OffsetNormal );
object rightInst = MappedInstances[ hashCode ];
while( rightInst != null )
{
tempOffsets.Add( (InstanceOffset) rightInst );
hashCode = HC( docIndex, ((InstanceOffset) rightInst).OffsetNormal );
rightInst = MappedInstances[ hashCode ];
}
}
private static Entry[] SelectRestrictedEntries( IEnumerable entries, uint sectionId )
{
ArrayList result = new ArrayList();
if( entries != null )
{
foreach( Entry e in entries )
{
InstanceOffset[] validOffsets = e.FilterOffsetsBySection( sectionId );
if( validOffsets.Length > 0 )
{
e.Offsets = validOffsets;
result.Add( e );
}
}
}
return ( result.Count == 0 ) ? null : (Entry[]) result.ToArray( typeof( Entry ));
}
public static string[] LastStoplist { get { return (string[])Stopwords.ToArray( typeof( string )); } }
public static string[] LastSearchLexemes { get { return (string[])Lexemes.ToArray( typeof( string )); } }
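// HC packs a (document index, instance offset) pair into a single 64-bit
// key: the document index occupies the high 32 bits and the unsigned offset
// the low 32 bits, so distinct pairs map to distinct keys.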
private static long HC( int docIndex, int offset ) { return ((long)docIndex) << 32 | (uint)offset; }
//-------------------------------------------------------------------------
private static readonly ArrayList Lexemes = new ArrayList();
private static readonly ArrayList Stopwords = new ArrayList();
private static readonly Hashtable MappedInstances = new Hashtable();
private static ErrorStatus Error;
}
public class MatchProcessor
{
/// <summary>
/// This local cache contains the IDs of terms which have already been looked up
/// in the term trie.
/// </summary>
private static readonly Dictionary<string, int> _termIDs = new Dictionary<string, int>();
public static bool MatchQuery( QueryPostfixForm postfixForm, IntHashTable tokens )
{
Stack<List<long>> opStack = new Stack<List<long>>();
bool result;
try
{
IteratePostfixExpression( postfixForm, tokens, opStack );
if( opStack.Count != 1 )
throw new ApplicationException( "QueryParser -- Illegal query statement found" );
result = (opStack.Peek() != null);
}
catch( Exception exc )
{
Trace.WriteLine( "MatchProcessor -- exception [" + exc.Message + "] occurred." );
result = true;
}
opStack.Clear();
return( result );
}
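// A minimal usage sketch (hypothetical caller; `tokens` is assumed to map
// the token IDs of a single document to their packed instance masks):
//
//   bool matches = MatchProcessor.MatchQuery( form, documentTokens );
//
// Note that on an internal failure the method deliberately returns true,
// so a document is never silently dropped because of a processing error.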
private static void IteratePostfixExpression( IList<QueryParserNode> postfixForm, IntHashTable tokens, Stack<List<long>> opStack )
{
for( int i = 0; i < postfixForm.Count; i++ )
{
QueryParserNode node = postfixForm[ i ];
switch( node.NodeType )
{
case QueryParserNode.Type.eoTerm:
PushTermOnStack( ((TermNode)node).Term, tokens, opStack ); break;
case QueryParserNode.Type.eoSection:
UnarySectionOp( ((SectionNode)node).SectionName, opStack ); break;
default:
BinaryOp( node, opStack ); break;
}
}
}
private static void PushTermOnStack( string term, IntHashTable tokens, Stack<List<long>> opStack )
{
List<long> resultVal = null;
if( FullTextIndexer.isValuableToken( term ) )
{
int HC;
// First check the ID of the term in the local cache. Since the number of
// distinct query terms over all queries in the system averages several
// tens, the cache stays small, and it saves a lookup in the term trie
// on every call.
if( !_termIDs.TryGetValue( term, out HC ) )
{
HC = Word.GetTokenIndex( term );
}
if( HC != -1 )
{
if( !_termIDs.ContainsKey( term ) )
_termIDs.Add( term, HC );
Object val = tokens[ HC ];
if( val != null )
{
resultVal = val as List<long>;
if( resultVal == null )
{
resultVal = new List<long>();
resultVal.Add( (long)val );
}
}
}
}
opStack.Push( resultVal );
}
private static void UnarySectionOp( string sectionName, Stack<List<long>> opStack )
{
if( opStack.Peek() != null )
{
uint sectionId = 0;
if( DocSectionHelper.IsShortNameExist( sectionName ))
sectionId = DocSectionHelper.OrderByShortName( sectionName );
else
if( DocSectionHelper.IsFullNameExist( sectionName ))
sectionId = DocSectionHelper.OrderByFullName( sectionName );
if( sectionId > 0 )
{
List<long> operand = opStack.Pop();
operand = SelectRestrictedMatchedEntries( operand, sectionId );
opStack.Push( operand );
}
}
}
private static List<long> SelectRestrictedMatchedEntries( IEnumerable<long> offsets, uint sectionId )
{
List<long> result = new List<long>();
foreach( long offset in offsets )
{
if( MaskEncoder.SectionId( offset ) == sectionId )
result.Add( offset );
}
return (result.Count == 0) ? null : result;
}
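// Each long value is assumed to pack a single token instance; MaskEncoder
// extracts its normal offset, sentence number, token order and section id
// from the corresponding bit fields.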
private static void BinaryOp( QueryParserNode node, Stack<List<long>> opStack )
{
#region Preconditions
if ( opStack.Count < 2 )
throw new ApplicationException( "QueryProcessor -- Insufficient number of operands in the operating stack" );
#endregion Preconditions
List<long> result;
List<long> right = opStack.Pop(), left = opStack.Pop();
if( node.NodeType == QueryParserNode.Type.eoOr )
result = Join( left, right );
else
result = Intercross( left, right, ((OpNode)node).RequiredProximity );
opStack.Push( result );
}
private static List<long> Intercross( List<long> leftIndices, List<long> rightIndices,
EntryProximity reqProximity )
{
if(( leftIndices == null ) || ( rightIndices == null ))
return( null );
leftIndices.Sort();
rightIndices.Sort();
List<long> result = new List<long>();
EntryProximity scope = ProximityEstimator.EstimateProximity( leftIndices, rightIndices );
if( scope <= reqProximity )
result.AddRange( JoinInstancesOfEntries( leftIndices, rightIndices, reqProximity ));
return (result.Count == 0) ? null : result;
}
private static List<long> Join( IEnumerable<long> leftIndices, IEnumerable<long> rightIndices )
{
List<long> result = new List<long>();
if( leftIndices != null ) result.AddRange( leftIndices );
if( rightIndices != null ) result.AddRange( rightIndices );
result.Sort();
return (result.Count == 0) ? null : result;
}
private static List<long> JoinInstancesOfEntries( List<long> left, List<long> right,
EntryProximity requiredProximity )
{
List<long> joinedList = new List<long>();
if( requiredProximity == EntryProximity.Phrase )
{
// The assumption is made that all offsets in the entries are
// sorted in ascending order.
int leftIndex = 0, rightIndex = 0;
while( leftIndex < left.Count && rightIndex < right.Count )
{
int order1 = MaskEncoder.TokenOrder( left[ leftIndex ] ),
order2 = MaskEncoder.TokenOrder( right[ rightIndex ] );
if( ProximityEstimator.isPhraseProximity( order1, order2 ))
{
joinedList.Add( left[ leftIndex ] );
joinedList.Add( right[ rightIndex ] );
}
if( order1 < order2 )
leftIndex++;
else
rightIndex++;
}
}
else
{
joinedList.AddRange( left );
joinedList.AddRange( right );
}
return( joinedList );
}
}
#region Proximity Estimator
//-----------------------------------------------------------------------------
// NB: this algorithm is close to optimal only under the assumption
// that all entry offsets are sorted!
//-----------------------------------------------------------------------------
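// EntryProximity values are assumed to be ordered Phrase < Sentence < Document,
// which is what makes comparisons such as "scope <= reqProximity" accept any
// proximity at least as tight as the one requested.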
internal class ProximityEstimator
{
internal static EntryProximity EstimateProximity( Entry Left, Entry Right )
{
Debug.Assert( Left.DocIndex == Right.DocIndex, "Illegal precondition for calling Estimator - doc IDs are different" );
int iLeft = 0, iRight = 0;
EntryProximity Result = EntryProximity.Document;
while(( iLeft < Left.Count ) && ( iRight < Right.Count ))
{
InstanceOffset leftOff = Left.Instance( iLeft );
InstanceOffset rightOff = Right.Instance( iRight );
if( leftOff.Sentence == rightOff.Sentence )
{
Result = EntryProximity.Sentence;
if( isPhraseProximity( leftOff, rightOff ) )
{
Result = EntryProximity.Phrase;
break;
}
}
if( leftOff.OffsetNormal < rightOff.OffsetNormal )
iLeft++;
else
iRight++;
}
return( Result );
}
internal static EntryProximity EstimateProximity( List<long> Left, List<long> Right )
{
int iLeft = 0, iRight = 0;
EntryProximity Result = EntryProximity.Document;
while(( iLeft < Left.Count ) && ( iRight < Right.Count ))
{
long leftOff = Left[ iLeft ];
long rightOff = Right[ iRight ];
if( MaskEncoder.Sentence( leftOff ) == MaskEncoder.Sentence( rightOff ) )
{
Result = EntryProximity.Sentence;
if( isPhraseProximity( MaskEncoder.TokenOrder( leftOff ), MaskEncoder.TokenOrder( rightOff ) ))
{
Result = EntryProximity.Phrase;
break;
}
}
if( MaskEncoder.OffsetNormal( leftOff ) < MaskEncoder.OffsetNormal( rightOff ) )
iLeft++;
else
iRight++;
}
return( Result );
}
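// Two instances form a phrase when the left token immediately precedes the
// right one, i.e. the left token order is exactly one less than the right.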
internal static bool isPhraseProximity( InstanceOffset left, InstanceOffset right )
{
return (left.TokenOrder - right.TokenOrder) == -1;
}
internal static bool isPhraseProximity( int left, int right )
{
return (left - right) == -1;
}
}
#endregion Proximity Estimator
}