///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using JetBrains.Omea.OpenAPI;
using JetBrains.Omea.ResourceTools;
using JetBrains.DataStructures;
namespace JetBrains.Omea.TextIndex
{
#region Filters/Comparers
/// <summary>
/// Orders <c>WordPtr</c> anchors by section id first and, inside one section,
/// by start offset (both ascending).
/// </summary>
internal class AnchorComparer : IComparer
{
    /// <summary>
    /// Compares two boxed <c>WordPtr</c> values.
    /// </summary>
    /// <returns>-1, 0 or 1 when <paramref name="x"/> sorts before, together with or after <paramref name="y"/>.</returns>
    public int Compare( object x, object y )
    {
        WordPtr left = (WordPtr)x;
        WordPtr right = (WordPtr)y;

        // Primary key: the section the anchor belongs to.
        if( left.SectionId != right.SectionId )
            return ( left.SectionId < right.SectionId ) ? -1 : 1;

        // Secondary key: position of the anchor.
        if( left.StartOffset != right.StartOffset )
            return ( left.StartOffset < right.StartOffset ) ? -1 : 1;

        return 0;
    }
}
#endregion Filters/Comparers
public class ContextCtor
{
#region Highlighting
/// <summary>
/// Produces one anchor per term occurrence stored in <paramref name="entry"/>
/// and returns the anchors sorted by section and start offset.
/// </summary>
/// <param name="entry">Index entry whose occurrences are converted.</param>
/// <param name="lexemes">Lexeme table indexed by the occurrence's BaseID.</param>
/// <param name="anchors">Receives the sorted anchors; its length equals entry.Count.</param>
public static void GetHighlightedTerms( Entry entry, string[] lexemes, out WordPtr[] anchors )
{
    anchors = new WordPtr[ entry.Count ];
    Trace.WriteLine( "HighlightTerms -- the following terms were processed for highlighting: " );

    int position = 0;
    while( position < entry.Count )
    {
        InstanceOffset occurrence = entry.Instance( position );
        uint wordformFlags = occurrence.Offset;
        string baseForm = lexemes[ occurrence.BaseID ];

        anchors[ position ].Original = baseForm;
        anchors[ position ].Text = ReconstructWordform( wordformFlags, baseForm, OMEnv.DictionaryServer );
        anchors[ position ].StartOffset = occurrence.OffsetNormal;
        anchors[ position ].SectionId = (int)occurrence.SectionId;
        anchors[ position ].Section = DocSectionHelper.FullNameByOrder( occurrence.SectionId );

        // Diagnostic dump of the anchor that has just been filled in.
        string traceLine = " [" + anchors[ position ].Text + "] at " + occurrence.OffsetNormal +
                           ", section " + anchors[ position ].Section + ", sentence " + occurrence.Sentence;
        Trace.WriteLine( traceLine );

        position++;
    }

    Array.Sort( anchors, new AnchorComparer() );
}
#endregion Highlighting
/// <summary>
/// Builds a single-line context string made of text fragments surrounding the
/// first occurrences of the search terms stored in <paramref name="termEntry"/>.
/// </summary>
/// <param name="termEntry">Index entry describing the document and the term occurrences.</param>
/// <param name="lexemes">Lexeme table indexed by the occurrence's BaseID.</param>
/// <param name="hgltPairs">Receives one OffsetData (start, length) per processed occurrence,
/// pointing into the returned string.</param>
/// <returns>
/// The fragments separated by cFragmentsDelimiter, or cNoContextSign when the resource
/// cannot be loaded or yields no text. If an exception interrupts the construction,
/// whatever was built so far is returned.
/// </returns>
public static string GetContext( Entry termEntry, string[] lexemes, out ArrayList hgltPairs )
{
    string context = cNoContextSign;
    int contextsNumber = Math.Min( MinimalNumberOfContexts, termEntry.Count );
    int[] shifts = new int[ termEntry.Count ];
    hgltPairs = new ArrayList();

    Collector.Init( termEntry.Offsets, shifts );
    try
    {
        // The resource (e.g. a temporary file) may be removed while we are processing it.
        IResource res = Core.ResourceStore.TryLoadResource( termEntry.DocIndex );
        if( res == null )
            return context;

        Core.PluginLoader.InvokeResourceTextProviders( res, Collector );

        // Collector.Body converts the whole accumulated text to a new string on every
        // access, so take one snapshot instead of re-creating it several times per term.
        string body = Collector.Body;
        if( body.Length == 0 )
            return context;

        context = cFragmentsDelimiter;
        int leftBorder = Int32.MaxValue, rightBorder = Int32.MinValue;
        int prevContextLength = 0;
        for( int i = 0; i < contextsNumber; i++ )
        {
            InstanceOffset instance = termEntry.Instance( i );
            int origOffset = instance.OffsetNormal;
            int offset = Collector.ConvertOffset( origOffset, instance.SectionId );
            ArrayList delimiterOffsets = new ArrayList();

            // Workaround for a possibly invalid text body reconstruction by a plugin,
            // when search terms appear outside of the text margins.
            if( offset >= body.Length )
                continue;

            if( offset < leftBorder || offset > rightBorder )
            {
                // The term lies outside the previous fragment: cut a new one around it.
                leftBorder = Math.Max( 0, offset - cContextSideLength );
                rightBorder = Math.Min( body.Length - 1, offset + cContextSideLength );
                TuneBorders( offset, body, ref leftBorder, ref rightBorder );

                string fragment = body.Substring( leftBorder, rightBorder - leftBorder + 1 );
                InsertSectionDelimiters( ref fragment, leftBorder, rightBorder, context.Length, delimiterOffsets );
                prevContextLength = context.Length;
                context += fragment + cFragmentsDelimiter;
            }
            else if( contextsNumber < termEntry.Count )
            {
                // The term falls into the fragment we already have, so allow
                // one more occurrence to be examined instead.
                contextsNumber++;
            }

            int startOffset = offset - leftBorder + prevContextLength;
            string lexeme = lexemes[ instance.BaseID ];
            lexeme = ReconstructWordform( instance.Offset, lexeme, OMEnv.DictionaryServer );
            TuneOffsetByBorders( ref startOffset, delimiterOffsets );
            hgltPairs.Add( new OffsetData( startOffset, lexeme.Length ));
        }

        // Make the context a single line.
        // NOTE(review): replacing "\r\n" with one blank shortens the string, so highlight
        // offsets computed above may end up shifted after such a pair -- confirm whether
        // callers compensate for this.
        context = context.Replace( "\r\n", " " );
        context = context.Replace( "\n", " " );
        context = context.Replace( "\r", " " );
        context = context.Replace( "\t", " " );

        Trace.WriteLine( "ContextExtractor -- context for [" + termEntry.DocIndex + "/" + res.Type + "] is [" + context + "]" );
        foreach( OffsetData pair in hgltPairs )
        {
            if( pair.Start + pair.Length >= context.Length )
                Trace.WriteLine( " highlight prefix of token [" + context.Substring( pair.Start ) + "]" );
            else
                Trace.WriteLine( " highlight token [" + context.Substring( pair.Start, pair.Length ) + "]" );
        }
    }
    catch( Exception exc )
    {
        // Here we catch the exceptions described in OM-10659, the reason for which
        // has not been found yet. The failure stays non-fatal (best effort), but it
        // is traced so that it does not disappear without a sign.
        Trace.WriteLine( "ContextExtractor -- context construction failed: " + exc );
    }
    return( context );
}
#region Aux
//---------------------------------------------------------------------
// Implements several simple heuristics that make a context fragment
// look nicer:
// 1. do not allow the fragment borders to cross words;
// 2. align the fragment along the sentence borders.
//
// offset      - position of the found term inside text.
// text        - the complete text the fragment is cut from.
// leftBorder  - in/out: index of the first character of the fragment.
// rightBorder - in/out: index of the last character of the fragment
//               (the caller extracts rightBorder - leftBorder + 1 characters).
//
// Throws ArgumentException when the incoming values violate the
// preconditions checked below, or when an intermediate result leaves
// the valid range.
//---------------------------------------------------------------------
private static void TuneBorders( int offset, string text, ref int leftBorder, ref int rightBorder )
{
//-- Preconditions ----------------------------------------------
if( offset < 0 )
throw new ArgumentException( "ContextConstructor -- Offset is negative - " + leftBorder + ":" + rightBorder + ":" + offset + ":" + text.Length );
if( leftBorder < 0 )
throw new ArgumentException( "ContextConstructor -- LeftBorder is non-positive - " + leftBorder + ":" + rightBorder + ":" + offset + ":" + text.Length );
if( rightBorder < 0 )
throw new ArgumentException( "ContextConstructor -- RightBorder is non-positive - " + leftBorder + ":" + rightBorder + ":" + offset + ":" + text.Length );
if( leftBorder >= text.Length )
throw new ArgumentException( "ContextConstructor -- LeftBorder is larger than text fragment - " + leftBorder + ":" + rightBorder + ":" + offset + ":" + text.Length );
if( rightBorder >= text.Length )
throw new ArgumentException( "ContextConstructor -- RightBorder is larger than text fragment - " + leftBorder + ":" + rightBorder + ":" + offset + ":" + text.Length );
if( leftBorder >= rightBorder )
throw new ArgumentException( "ContextConstructor -- LeftBorder is larger of RightBorder - " + leftBorder + ":" + rightBorder + ":" + offset + ":" + text.Length );
//-- End of Preconditions ---------------------------------------
int delimIndex;
// Left side: move leftBorder just past the first blank found at or after it,
// provided that blank lies before offset - 1. The characters dropped on the
// left are given back on the right (clamped to the last character of text).
if( leftBorder > 0 ) // do not touch if == 0.
{
delimIndex = text.IndexOf( ' ', leftBorder );
if(( delimIndex != -1 ) && ( delimIndex < offset - 1 )) // multiple blanks???
{
rightBorder = Math.Min( rightBorder + (delimIndex - leftBorder), text.Length - 1 );
leftBorder = delimIndex + 1;
}
}
// Right side: search backwards from rightBorder (over rightBorder - offset
// characters) for a blank and cut the fragment there, but only when more than
// cMinimalContextSideLength characters remain between offset and the blank.
delimIndex = text.LastIndexOf( ' ', rightBorder, rightBorder - offset );
if(( delimIndex != -1 ) && ( delimIndex - offset > cMinimalContextSideLength ))
{
rightBorder = delimIndex;
}
//-----------------------------------------------------------------
// Sentence alignment: look for the earliest sentence delimiter
// (punctuation mark followed by a blank or a line break) between
// leftBorder and offset.
delimIndex = SentenceDelimiterIndex( text, leftBorder, offset - leftBorder );
if( delimIndex >= offset )
throw new ArgumentException( "ContextConstructor -- Invalid calculation of sentence border - " + leftBorder + ":" + rightBorder + ":" + offset + ":" + text.Length );
// Start the fragment right after the two-character delimiter, unless the term
// itself immediately follows it; again compensate the removed characters on the right.
if(( delimIndex != -1 ) && ( delimIndex + 2 != offset ))
{
rightBorder = Math.Min( rightBorder + (delimIndex - leftBorder), text.Length - 1 );
leftBorder = delimIndex + 2;
}
if( rightBorder < 0 )
throw new ArgumentException( "ContextConstructor -- RightBorder is negative (second round) - " + leftBorder + ":" + rightBorder + ":" + offset + ":" + text.Length );
if( rightBorder >= text.Length )
throw new ArgumentException( "ContextConstructor -- RightBorder is larger than text fragment (second round) - " + leftBorder + ":" + rightBorder + ":" + offset + ":" + text.Length );
// rightBorder may have been extended by the sentence alignment, so the
// word-boundary cut on the right side is applied once more.
delimIndex = text.LastIndexOf( ' ', rightBorder, rightBorder - offset );
if(( delimIndex != -1 ) && ( delimIndex - offset > cMinimalContextSideLength ))
{
rightBorder = delimIndex;
}
if( rightBorder >= text.Length )
throw new ArgumentException( "ContextConstructor -- RightBorder (final) is larger than text fragment (second round) - " + leftBorder + ":" + rightBorder + ":" + offset + ":" + text.Length );
}
// Returns the index of the earliest sentence delimiter ('.', '?' or '!'
// followed by a blank, '\n' or '\r') found in text within the window that
// starts at start and spans length characters, or -1 when there is none.
private static int SentenceDelimiterIndex( string text, int start, int length )
{
    // The order is kept as is: every hit narrows the window for the following searches.
    string[] delimiters = { ". ", ".\n", ".\r", "? ", "?\r", "?\n", "! ", "!\r", "!\n" };

    int earliest = Int32.MaxValue;
    foreach( string delimiter in delimiters )
    {
        DelimiterIndex( delimiter, text, start, ref length, ref earliest );
    }

    if( earliest == Int32.MaxValue )
        return -1;
    return earliest;
}
// Searches text for fragment inside the window [start, start + length).
// On success index is lowered to the found position (if it is smaller than
// the current value) and length is shrunk so that subsequent searches only
// look in front of the hit. When nothing is found both values stay untouched.
//
// The search is ordinal: the positions feed offset arithmetic, so a
// culture-sensitive match (the default of String.IndexOf(string, int, int))
// must not be allowed to skip or fold characters.
private static void DelimiterIndex( string fragment, string text, int start, ref int length, ref int index )
{
    int delimIndex = text.IndexOf( fragment, start, length, StringComparison.Ordinal );
    if( delimIndex != -1 )
    {
        index = Math.Min( index, delimIndex );
        length = delimIndex - start;
    }
}
// Inserts cSectionsDelimiter into the fragment at every section border
// (taken from Collector._sectionBorders) that lies strictly between
// leftBorder and rightBorder. For each inserted delimiter its position,
// shifted by curLength, is appended to borders.
// Throws ArgumentException when a computed position is beyond the end of text.
private static void InsertSectionDelimiters( ref string text,
                                             int leftBorder, int rightBorder,
                                             int curLength, IList borders )
{
    int insertedChars = 0;
    foreach( int sectionBorder in Collector._sectionBorders )
    {
        bool insideFragment = sectionBorder > leftBorder && sectionBorder < rightBorder;
        if( !insideFragment )
            continue;

        // Position inside the fragment, corrected by the delimiters inserted so far.
        int position = sectionBorder - leftBorder + insertedChars;
        if( position > text.Length )
            throw new ArgumentException( "ContextCtor -- construction of a context string failed: offset is larger than the length. Sorry." );

        text = text.Insert( position, cSectionsDelimiter );
        insertedChars += cSectionsDelimiter.Length;
        borders.Add( position + curLength );
    }
}
// Shifts startOffset to the right by the length of one section delimiter
// for every recorded delimiter position that is not greater than the
// (already shifted) startOffset.
// The shift is derived from cSectionsDelimiter instead of a literal so that
// it stays in sync with what InsertSectionDelimiters actually inserts.
private static void TuneOffsetByBorders( ref int startOffset, ArrayList delimiterOffsets )
{
    foreach( int offset in delimiterOffsets )
    {
        if( offset <= startOffset )
            startOffset += cSectionsDelimiter.Length;
    }
}
#endregion Aux
#region WordformReconstruction
//---------------------------------------------------------------------
// Using simple heuristics, reconstructs the wordform (live form) from the
// lexeme and the information encoded in the high bits of Offset:
// - isPlural       - simple plural or 3rd person: "-s" ("-y" becomes "-ies");
// - isPast         - "-ed" form ("-y" becomes "-ied", a final "e" is not doubled);
// - isContinuous   - "-ing" form (a final "e" is dropped);
// - isWordformIndex - the wordform is looked up in the dictionary server by
//                     the index packed into the bits.
// A null or empty lexeme is returned unchanged: it has no last character
// to inspect, and indexing it used to throw.
// If the dictionary lookup fails, the lexeme itself is returned.
//---------------------------------------------------------------------
protected static string ReconstructWordform( uint Offset, string lexeme, DictionaryServer dicServer )
{
    if( String.IsNullOrEmpty( lexeme ))
        return lexeme;

    string wordform = lexeme;
    string stem = lexeme.Substring( 0, lexeme.Length - 1 );
    // Ordinal check of the last character; all branches use the same test.
    char lastChar = lexeme[ lexeme.Length - 1 ];

    if( isPlural( Offset ))
    {
        if( lastChar == 'y' )
            wordform = stem + "ie";
        wordform += "s";
    }
    else if( isPast( Offset ))
    {
        if( lastChar == 'y' )
            wordform = stem + 'i';
        if( wordform[ wordform.Length - 1 ] != 'e' )
            wordform += 'e';
        wordform += 'd';
    }
    else if( isContinuous( Offset ))
    {
        if( lastChar == 'e' )
            wordform = stem;
        wordform += "ing";
    }
    else if( isWordformIndex( Offset ))
    {
        int index = RetrieveIndexFromBits( Offset );
        //-------------------------------------------------------------
        // NB: Exceptional conditions are possible if e.g. DictionaryServer
        //     failed to flush the wordforms file to disk (due to any
        //     external conditions) and reread its previous state.
        //-------------------------------------------------------------
        try
        {
            wordform = dicServer.GetLexemeMapping( lexeme, index );
        }
        catch( Exception exc )
        {
            Trace.WriteLine( "ContextConstructor -- Did not manage to find wordform mapping <" + index +
                             "> for lexeme [" + lexeme + "] due to the exception:" );
            Trace.WriteLine( exc.Message );
            Trace.WriteLine( "ContextConstructor -- The lexeme value is used by default as the Wordform." );
            wordform = lexeme;
        }
    }
    return( wordform );
}
// True when only the plural bit (0x80) of the form bits is set.
protected static bool isPlural( uint Mask )
{
    bool ownBitSet = ( Mask & 0x80000000 ) != 0;
    bool otherBitsClear = ( Mask & 0x63000000 ) == 0;
    return ownBitSet && otherBitsClear;
}
// True when only the past-form bit (0x40) of the form bits is set.
protected static bool isPast( uint Mask )
{
    bool ownBitSet = ( Mask & 0x40000000 ) != 0;
    bool otherBitsClear = ( Mask & 0xA3000000 ) == 0;
    return ownBitSet && otherBitsClear;
}
// True when only the continuous-form bit (0x20) of the form bits is set.
protected static bool isContinuous( uint Mask )
{
    bool ownBitSet = ( Mask & 0x20000000 ) != 0;
    bool otherBitsClear = ( Mask & 0xC3000000 ) == 0;
    return ownBitSet && otherBitsClear;
}
// True when at least one of the two lowest bits of the top byte (0x03) is set.
protected static bool isWordformIndex( uint Mask )
{
    return ( Mask & 0x03000000 ) != 0;
}
// True when the 0x10 bit is set and the 0x08 bit is clear.
protected static bool isSuffixedComma( uint Mask )
{
    bool commaBitSet = ( Mask & 0x10000000 ) != 0;
    bool colonBitClear = ( Mask & 0x08000000 ) == 0;
    return commaBitSet && colonBitClear;
}
// True when the 0x08 bit is set and the 0x10 bit is clear.
protected static bool isSuffixedColon( uint Mask )
{
    bool colonBitSet = ( Mask & 0x08000000 ) != 0;
    bool commaBitClear = ( Mask & 0x10000000 ) == 0;
    return colonBitSet && commaBitClear;
}
// True when the 0x04 bit is set.
protected static bool isLeftPar( uint Mask )
{
    return ( Mask & 0x04000000 ) != 0;
}
// True when BOTH the 0x10 and the 0x08 bits are set.
// isSuffixedComma and isSuffixedColon each require one of these bits with the
// other one clear, so the combination of both is the remaining, distinct state.
// The former "> 0" test was also true for a lone comma or a lone colon bit.
protected static bool isRightPar( uint Mask )
{
    return ( Mask & 0x18000000 ) == 0x18000000;
}
// Packs five flag bits of Mask into a number in the range 0..31:
// 0x01 -> 1, 0x02 -> 2, 0x20 -> 4, 0x40 -> 8, 0x80 -> 16 (bits of the top byte).
protected static int RetrieveIndexFromBits( uint Mask )
{
    return (( Mask & 0x01000000 ) != 0 ? 1 : 0 )
         + (( Mask & 0x02000000 ) != 0 ? 2 : 0 )
         + (( Mask & 0x20000000 ) != 0 ? 4 : 0 )
         + (( Mask & 0x40000000 ) != 0 ? 8 : 0 )
         + (( Mask & 0x80000000 ) != 0 ? 16 : 0 );
}
#endregion
#region Attributes
// Marker placed before, between and after the fragments of a context string.
public const string cFragmentsDelimiter = "...";
// Returned by GetContext when no context can be built: the words " no context "
// enclosed in two U+2015 characters.
private static readonly string cNoContextSign = (char)(0x2015) + " no context " + (char)(0x2015);
// Number of term occurrences GetContext starts with (limited by the number of
// occurrences available); it grows when an occurrence falls into an existing fragment.
private const int MinimalNumberOfContexts = 2;
// Number of characters initially taken to the left and to the right of a term.
private const int cContextSideLength = 36;
// TuneBorders cuts the right side at a blank only if more than this many
// characters remain between the term offset and that blank.
private const int cMinimalContextSideLength = 20;
// Marker inserted into a fragment at the positions where document sections meet.
private const string cSectionsDelimiter = "][";
// Single collector instance shared by all calls of this class.
// NOTE(review): being static and mutable it looks unsafe for concurrent
// GetContext calls -- confirm that callers serialize access.
private static readonly TextCollector Collector = new TextCollector();
#endregion Attributes
}
#region TextCollector
/// <summary>
/// TextCollector is an implementation of the IResourceTextConsumer interface
/// that collects the complete text body of a resource for further
/// extraction of context substrings.
/// </summary>
internal class TextCollector: IResourceTextConsumer
{
/// <summary>
/// Resets the collector before the text of a new resource is gathered:
/// drops the accumulated text and all section bookkeeping.
/// Both parameters are currently not used.
/// </summary>
internal void Init( InstanceOffset[] offsets, int[] shifts )
{
    // Throw away the text gathered for the previous resource.
    RejectResult();

    // Forget everything known about sections.
    SavedNames.Clear();
    _sectionBorders.Clear();
    SectionStartOffset.Clear();
    LastSectionRestartsOffset = 0;
    LastSection = "";
}
/// <summary>
/// The text accumulated so far. A new string is produced on every access.
/// </summary>
internal string Body
{
    get
    {
        string snapshot = AccumulatedBody.ToString();
        return snapshot;
    }
}
/// <summary>
/// Does nothing.
/// </summary>
internal static void Finished()
{
}
#region IResourceTextConsumer2 interface
/// <summary>
/// Adds a text fragment that belongs to the subject section.
/// </summary>
public void AddDocumentHeading( int docID, string text )
{
    string section = DocumentSection.SubjectSection;
    AddDocumentFragment( docID, text, section );
}
/// <summary>
/// Adds a text fragment that belongs to the body section.
/// </summary>
public void AddDocumentFragment( int docID, string text )
{
    string section = DocumentSection.BodySection;
    AddDocumentFragment( docID, text, section );
}
/// <summary>
/// Appends a non-empty text fragment of the given section to the accumulated
/// body, registering a section switch first. The document id is remembered
/// even when the fragment is null or empty.
/// </summary>
public void AddDocumentFragment( int docID, string text, string sectionName )
{
    bool hasText = text != null && text.Length > 0;
    if( hasText )
    {
        AnalyzeSectionBorder( sectionName );
        AccumulatedBody.Append( text );
    }
    ResId = docID;
}
// As was agreed with HtmlParser, this method is called exclusively
// for skipping tag information. Exactly count blanks are appended, so the
// accumulated text grows by the number of skipped characters.
// A zero or negative count appends nothing.
public void IncrementOffset( int count )
{
    // StringBuilder.Append( char, int ) rejects a negative repeat count,
    // while the former per-character loop silently ignored it.
    if( count > 0 )
        AccumulatedBody.Append( ' ', count );
}
/// <summary>
/// Remembers the current length of the accumulated text as the position
/// from which the offsets of the next new section are counted.
/// </summary>
public void RestartOffsetCounting()
{
    int currentLength = AccumulatedBody.Length;
    LastSectionRestartsOffset = currentLength;
}
/// <summary>
/// Discards the accumulated text and shrinks the buffer when it has grown large.
/// </summary>
public void RejectResult()
{
    const int shrinkThreshold = 16384;
    const int shrunkCapacity = 1024;

    AccumulatedBody.Length = 0;
    bool bufferIsLarge = AccumulatedBody.Capacity > shrinkThreshold;
    if( bufferIsLarge )
        AccumulatedBody.Capacity = shrunkCapacity;
}
/// <summary>
/// The text is always requested for context extraction.
/// </summary>
public TextRequestPurpose Purpose
{
    get
    {
        return TextRequestPurpose.ContextExtraction;
    }
}
#endregion IResourceTextConsumer2 interface
//--------------------------------------------------------------------
#region Impl
/// <summary>
/// Converts an offset counted inside a section into an offset inside the
/// accumulated body by adding the start offset registered for that section.
/// </summary>
/// <exception cref="ApplicationException">
/// No start offset is registered for <paramref name="sectionId"/>; the message
/// carries a dump of the saved section names and of all section resources.
/// </exception>
internal int ConvertOffset( int offset, uint sectionId )
{
    int sectionKey = (int)sectionId;
    if( SectionStartOffset.ContainsKey( sectionKey ) )
        return offset + SectionStartOffset[ sectionKey ];

    // Unknown section: collect as much diagnostic information as possible.
    StringBuilder report = new StringBuilder(
        "Mismatch between section names in primary parsing and body extraction [" + sectionId +
        "] on offset=[" + offset + "], resource type=[" + Core.ResourceStore.LoadResource( ResId ).Type + "];" );

    foreach( string savedName in SavedNames.Keys )
    {
        report.Append( " Saved section dump [" ).Append( savedName )
              .Append( "] with Id=" ).Append( (int)SavedNames[ savedName ] );
    }

    IResourceList sections = Core.ResourceStore.GetAllResources( DocumentSectionResource.DocSectionResName );
    foreach( IResource section in sections )
    {
        report.Append( " DocSection Dump: name=" ).Append( section.GetStringProp( "Name" ) )
              .Append( " with order=" ).Append( section.GetIntProp("SectionOrder") );
    }

    throw new ApplicationException( "ContextConstruction -- " + report.ToString() );
}
// Registers a switch to another section. When sectionName differs from the
// last seen one: the current text length is recorded as a section border
// (unless no text has been accumulated yet), the start offset of the section
// is stored the first time the section is met, and the section becomes the
// current one.
private void AnalyzeSectionBorder( string sectionName )
{
    if( sectionName == LastSection )
        return;

    int currentLength = AccumulatedBody.Length;
    if( currentLength > 0 )
        _sectionBorders.Add( currentLength );

    int order = (int)DocSectionHelper.OrderByFullName( sectionName );
    bool seenBefore = SectionStartOffset.ContainsKey( order );
    if( !seenBefore )
        SectionStartOffset[ order ] = LastSectionRestartsOffset;

    LastSection = sectionName;
    LastSectionId = order;
    SavedNames[ sectionName ] = order;
}
#endregion Impl
#region Attributes
// Text of the resource collected so far.
private readonly StringBuilder AccumulatedBody = new StringBuilder();
// Section id -> offset in AccumulatedBody from which offsets of that section are counted.
private readonly IntHashTableOfInt SectionStartOffset = new IntHashTableOfInt();
// Name of the section the most recent non-empty fragment belonged to.
private string LastSection = "";
// Id of that section.
private int LastSectionId = 0;
// Length of AccumulatedBody at the last RestartOffsetCounting call.
private int LastSectionRestartsOffset = 0;
// Positions in AccumulatedBody at which one section ends and another starts.
// The element type is spelled out: the values are text lengths (int), and
// they are enumerated as int by ContextCtor.InsertSectionDelimiters.
public List<int> _sectionBorders = new List<int>();
// Section name -> section id, kept for diagnostic dumps.
private readonly Hashtable SavedNames = new Hashtable();
// Id of the document passed to the most recent AddDocumentFragment call.
private int ResId;
#endregion Attributes
}
#endregion TextCollector
}