///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text;
using JetBrains.Omea.Base;
using JetBrains.Omea.TextIndex;
using JetBrains.Omea.OpenAPI;
using JetBrains.Omea.Containers;
using JetBrains.DataStructures;
namespace JetBrains.Omea.TextIndex
{
/**
* Class that manages the construction of the full-text index.
*/
public class FullTextIndexer: IResourceTextConsumer
{
/// <summary>
/// Creates the indexer, stores it as the shared static instance, registers the
/// property and resource types used by the text index, and passes the working
/// directory to IndexConstructor.
/// </summary>
public FullTextIndexer()
{
// Make this object reachable through the static Instance property.
theIndexer = this;
RegisterPropertyTypes();
IndexConstructor.WorkDir = OMEnv.WorkDir;
}
/// <summary>
/// The indexer stored by the most recent constructor call (null until one is created).
/// </summary>
public static FullTextIndexer Instance
{
    get
    {
        return theIndexer;
    }
}
/// <summary>
/// Intentionally empty: this consumer takes no action here.
/// </summary>
public void RejectResult()
{
}
/// <summary>
/// Always reports <c>TextRequestPurpose.Indexing</c>.
/// </summary>
public TextRequestPurpose Purpose
{
    get
    {
        return TextRequestPurpose.Indexing;
    }
}
/// <summary>
/// Creates the text parser, reads the "SuppressTraces" setting, removes leftover
/// temporary index files and loads the existing term index.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown when the parser already exists, i.e. Initialize was called before.
/// </exception>
public void Initialize()
{
    #region Preconditions
    // Fix: the message used to misspell the class name as "FillTextIndexer".
    if( _textParser != null )
        throw new InvalidOperationException( "FullTextIndexer.Initialize() is invoked twice" );
    #endregion Preconditions

    _textParser = new TextDocParser();
    _suppTrace = Core.SettingStore.ReadBool( "TextIndexing", "SuppressTraces", false );

    // Stale temp files are removed before the index is opened.
    CleanIndexTempFiles();
    LoadExistingIndices();
}
#region Index Cleaning and Loading on Startup
/// <summary>
/// Deletes temporary/incremental chunk files of the term index and the temp
/// files left by the old document index format, if they exist.
/// </summary>
private static void CleanIndexTempFiles()
{
    string termChunk = OMEnv.TermIndexFileName + OMEnv.IncChunkExtension;
    DeleteFile( termChunk );
    DeleteFile( termChunk + OMEnv.HeaderExtension );

    // Only old-format installations have these; the new format keeps
    // no temp files for the document index.
    DeleteFile( "_doc.index.tmp" );
    string docChunk = "_doc.index" + OMEnv.IncChunkExtension;
    DeleteFile( docChunk );
    DeleteFile( docChunk + OMEnv.HeaderExtension );
}
//---------------------------------------------------------------------
// Loads the term index on startup. If index registration flagged the
// index for discarding (_needDiscard), it is discarded and reopened
// right after loading. A FormatException or IOException raised while
// loading is traced and answered by discarding the index.
// NOTE(review): an earlier comment here described a two-phase load
// (main chunk first, then the smaller one) meant to limit what is lost
// when corruption happens during chunk merging; the code below loads a
// single term index - confirm whether that description still applies.
//---------------------------------------------------------------------
private void LoadExistingIndices()
{
    try
    {
        LoadTermIndex();

        // DiscardTextIndex also reopens the index structures.
        if( _needDiscard )
            DiscardTextIndex();

        NotifyIndexLoaded();
        Trace.WriteLineIf( !_suppTrace, "-- FullTextIndexer -- Text index is loaded or initialized" );
    }
    catch( FormatException )
    {
        Trace.WriteLineIf( !_suppTrace, "-- FullTextIndexer -- Found a corrupted index while loading accessors." );
        DiscardTextIndex();
    }
    catch( IOException ioError )
    {
        // The exception text is traced regardless of the suppression flag.
        Trace.WriteLine( ioError.Message );
        Trace.WriteLineIf( !_suppTrace, "-- FullTextIndexer -- Serious IO Exception occured while loading accessors." );
        DiscardTextIndex();
    }
}
/// <summary>
/// Asks the text index manager to index again every document recorded in
/// _docVersionsToProcess, then empties that table.
/// </summary>
private void RerequestDocVersionsIndexing()
{
    int postponed = _docVersionsToProcess.Count;
    Trace.WriteLineIf( !_suppTrace, "-- FullTextIndexer -- " + postponed + " documents is rerequested for indexing" );

    foreach( IntHashTableOfInt.Entry pending in _docVersionsToProcess )
    {
        Core.TextIndexManager.QueryIndexing( pending.Key );
    }
    _docVersionsToProcess.Clear();
}
#endregion Index Loading on Startup
#region Registration
/// <summary>
/// Registers the virtual search-result properties, the internal properties and
/// resource type describing document sections, the four predefined sections,
/// and finally the index versioning types.
/// </summary>
private void RegisterPropertyTypes()
{
IPropTypeCollection props = Core.ResourceStore.PropTypes;
// Virtual properties; their ids are kept in public static fields.
SearchRankPropId = props.Register( "SearchRank", PropDataType.Double, PropTypeFlags.Virtual );
props.RegisterDisplayName( SearchRankPropId, "Search Rank" );
SimilarityPropId = props.Register( "Similarity", PropDataType.Double, PropTypeFlags.Virtual );
ContextPropId = props.Register( "Context", PropDataType.String, PropTypeFlags.Virtual );
ProximityPropId = props.Register( "Proximity", PropDataType.Int, PropTypeFlags.Virtual );
ContextHighlightPropId = props.Register( "HighlightContext", PropDataType.String, PropTypeFlags.Virtual );
// Internal properties carried by document-section resources.
props.Register( DocumentSectionResource.SectionHelpDescription, PropDataType.String, PropTypeFlags.Internal );
props.Register( "SectionShortName", PropDataType.String, PropTypeFlags.Internal );
props.Register( "SectionOrder", PropDataType.Int, PropTypeFlags.Internal );
// The section resource type is internal and not indexed; its Name must be unique.
Core.ResourceStore.ResourceTypes.Register( DocumentSectionResource.DocSectionResName, "", ResourceTypeFlags.Internal | ResourceTypeFlags.NoIndex );
Core.ResourceStore.RegisterUniqueRestriction( DocumentSectionResource.DocSectionResName, Core.Props.Name );
// Predefined sections: name, help description, short name (none for the body section).
RegisterDocumentSection( DocumentSection.BodySection, "Full content of the resource", null );
RegisterDocumentSection( DocumentSection.SubjectSection, "Describes subject (or heading, title) of the e-mail, article, etc", "SU" );
RegisterDocumentSection( DocumentSection.AnnotationSection, "A note added by way of comment or explanation", "AN" );
RegisterDocumentSection( DocumentSection.SourceSection, "Identifies a source of the resource - person, site, server, etc.", "SRC" );
RegisterIndexVersioningTypes();
}
/// <summary>
/// Registers the resource type and properties that implement text index
/// versioning, deletes the obsolete "InTextIndex" property, and loads (or
/// creates with value 1) the resource that stores the current index version.
/// </summary>
private void RegisterIndexVersioningTypes()
{
    IPropTypeCollection propTypes = Core.ResourceStore.PropTypes;

    // A missing token tree file makes LoadExistingIndices discard the index.
    _needDiscard = !File.Exists( OMEnv.TokenTreeFileName );

    // The (single) resource of this type keeps the current version of the
    // text index in its "TextIndexVersion" property.
    Core.ResourceStore.ResourceTypes.Register( "TextIndexVersion", "Name", ResourceTypeFlags.Internal | ResourceTypeFlags.NoIndex );

    // Current version of the text index (not the version of the index file
    // format). It is incremented whenever the index is discarded, which
    // invalidates every resource whose "InTextIndexVersion" still refers to
    // an older version.
    TextIndexVersionProp = propTypes.Register( "TextIndexVersion", PropDataType.Int, PropTypeFlags.Internal );

    // Version of the text index in which a resource was indexed. A resource
    // counts as indexed only while this equals the current index version.
    DocInVersionIndexProp = propTypes.Register( "InTextIndexVersion", PropDataType.Int, PropTypeFlags.Internal );

    // Remove the property used by the previous (boolean) scheme.
    if( propTypes.Exist( "InTextIndex" ) )
    {
        propTypes.Delete( propTypes[ "InTextIndex" ].Id );
    }

    // Load the current version; on the very first run with the versioning
    // scheme, create the version resource with version 1.
    IResourceList stored = Core.ResourceStore.FindResourcesWithProp( null, TextIndexVersionProp );
    if( stored.Count != 0 )
    {
        _indexVersionRes = stored[ 0 ];
    }
    else
    {
        _indexVersionRes = Core.ResourceStore.BeginNewResource( "TextIndexVersion" );
        _indexVersionRes.SetProp( TextIndexVersionProp, 1 );
        _indexVersionRes.EndUpdate();
    }
    _indexVersion = _indexVersionRes.GetIntProp( TextIndexVersionProp );
}
/// <summary>
/// Registers a document section that has neither a help description nor a short name.
/// </summary>
/// <param name="sectionName">Name of the section.</param>
/// <returns>The existing or newly created section resource.</returns>
private IResource RegisterDocumentSection( string sectionName )
{
return RegisterDocumentSection( sectionName, null, null );
}
/// <summary>
/// Finds the document-section resource with the given name or creates it, and
/// records the section's order number in the name-to-number mapping.
/// </summary>
/// <param name="sectionName">Unique name of the section.</param>
/// <param name="description">Optional help description; stored only when non-empty.</param>
/// <param name="shortName">Optional short name; stored only when non-empty.</param>
/// <returns>The existing or newly created section resource.</returns>
private IResource RegisterDocumentSection( string sectionName, string description, string shortName )
{
    int sectionNum;
    IResource section = Core.ResourceStore.FindUniqueResource( DocumentSectionResource.DocSectionResName, Core.Props.Name, sectionName );
    if( section == null )
    {
        // A new section is numbered by the count of sections that already exist.
        sectionNum = Core.ResourceStore.GetAllResources( DocumentSectionResource.DocSectionResName ).Count;

        ResourceProxy proxy = ResourceProxy.BeginNewResource( DocumentSectionResource.DocSectionResName );
        proxy.BeginUpdate();
        proxy.SetProp( "Name", sectionName );
        proxy.SetProp( "SectionOrder", sectionNum );

        // Fix: these checks were inverted, so a supplied description or short
        // name was never stored and only null/empty values were written.
        if( !String.IsNullOrEmpty( description ) )
            proxy.SetProp( "SectionHelpDescription", description );
        if( !String.IsNullOrEmpty( shortName ) )
            proxy.SetProp( "SectionShortName", shortName );

        proxy.EndUpdate();
        section = proxy.Resource;
    }
    else
    {
        sectionNum = section.GetIntProp( "SectionOrder" );
    }

    _sectionsMapping[ sectionName ] = sectionNum;
    return section;
}
#endregion Registration
#region Document Scope
/// <summary>
/// Submits text for the subject section of the document.
/// </summary>
/// <param name="docID">Id of the document resource.</param>
/// <param name="text">Heading text.</param>
public void AddDocumentHeading( int docID, string text )
{
AddDocumentFragment( docID, text, DocumentSection.SubjectSection );
}
/// <summary>
/// Submits text for the body section of the document.
/// </summary>
/// <param name="docID">Id of the document resource.</param>
/// <param name="text">Fragment text.</param>
public void AddDocumentFragment( int docID, string text )
{
AddDocumentFragment( docID, text, DocumentSection.BodySection );
}
/// <summary>
/// Submits a text fragment of a document for the named section. A fragment
/// with a new document id first finishes the previous document.
/// </summary>
/// <param name="docID">Id of the document resource; negative ids are ignored.</param>
/// <param name="text">Fragment text; null is ignored.</param>
/// <param name="sectionName">Name of the section the text belongs to.</param>
public void AddDocumentFragment( int docID, string text, string sectionName )
{
    // Guards against just-deleted resources (negative id) and null text.
    if( docID < 0 || text == null )
        return;

    // A document already finished in this batch is not processed again now:
    // its id is remembered and indexing is re-requested once the batch ends.
    // This keeps the maintenance of term and document statistics simple.
    if( _finishedDocsInBatch.ContainsKey( docID ) )
    {
        Trace.WriteIf( !_suppTrace, "-- FullTextIndexer -- new version of doc " + docID );
        IResource newerVersion = Core.ResourceStore.TryLoadResource( docID );
        if( newerVersion != null )
        {
            Trace.WriteIf( !_suppTrace, " has come (" + newerVersion.Type + ")" );
        }
        Trace.WriteLineIf( !_suppTrace, " " );
        _docVersionsToProcess[ docID ] = 1;
        return;
    }

    uint sectionId = (uint) CheckSection( sectionName );

    // A different id means the previous document is complete.
    if( docID != _lastDocID )
    {
        DocumentDone();
        CheckPreviewSign( docID );
    }

    // Either a further fragment of the current document or the first of a new one.
    ProcessDocument( docID, text, sectionId );
    _lastDocID = docID;
}
/// <summary>
/// Called when the text provider is about to submit a chunk of text for a
/// different section. Asks the parser to flush its offset so that offsets
/// start from the beginning again, which makes highlight processing during
/// search more convenient.
/// </summary>
public void RestartOffsetCounting()
{
_textParser.FlushOffset();
}
/// <summary>
/// Forwards the given amount to the parser's offset counter.
/// </summary>
/// <param name="spacesAmount">Non-negative amount to add.</param>
/// <exception cref="ArgumentOutOfRangeException">The amount is negative.</exception>
public void IncrementOffset( int spacesAmount )
{
    if( spacesAmount < 0 )
    {
        throw new ArgumentOutOfRangeException( "spacesAmount", "FullTextIndexer -- Amount of spaces to be passed is negative." );
    }

    _textParser.IncrementOffset( spacesAmount );
}
/// <summary>
/// Returns the order number of the named section, registering the section
/// first when it is not in the mapping yet.
/// </summary>
private int CheckSection( string sectionName )
{
    HashMap.Entry known = _sectionsMapping.GetEntry( sectionName );
    if( known != null )
        return (int) known.Value;

    // Unknown section: register it and read back its order number.
    IResource registered = RegisterDocumentSection( sectionName );
    return registered.GetIntProp( "SectionOrder" );
}
/// <summary>
/// Decides whether a preview must be collected for the document: true only
/// when the resource can be loaded and has the NeedPreview property.
/// </summary>
private void CheckPreviewSign( int docId )
{
    IResource candidate = Core.ResourceStore.TryLoadResource( docId );
    if( candidate == null )
    {
        _mustConstructPreview = false;
    }
    else
    {
        _mustConstructPreview = candidate.HasProp( Core.Props.NeedPreview );
    }
}
/// <summary>
/// Feeds the text to the parser, extends the preview fragment when one is
/// requested, and logs every token that is not a stop word.
/// </summary>
/// <param name="docID">Id of the document the text belongs to.</param>
/// <param name="text">Fragment text (may be empty).</param>
/// <param name="sectionId">Order number of the section; 0 leaves the token's section untouched.</param>
/// <exception cref="ApplicationException">The parser has not been created.</exception>
private void ProcessDocument( int docID, string text, uint sectionId )
{
    if( _textParser == null )
    {
        throw new ApplicationException( "FullTextIndexer -- Impossible situation when Parser is NULL" );
    }

    // A section change is a logical border: bumping the sentence counter
    // keeps NEAR searches from pairing tokens across it even when the text
    // has no punctuation there.
    if( sectionId != _prevSectionId )
        _textParser.IncrementSentence();
    _prevSectionId = sectionId;

    bool startsNewDocument = ( docID != _lastDocID );
    if( startsNewDocument )
        _textParser.Init( text ); // empty text is allowed
    else
        _textParser.Next( text );

    // Collect preview text up to the size limit.
    if( _mustConstructPreview && _previewFragment.Length < _cPreviewSize )
    {
        _previewFragment.Append( text );
        if( _previewFragment.Length > _cPreviewSize )
            _previewFragment.Length = _cPreviewSize;
    }

    for( Word word = _textParser.getNextWord();
         word.Tag != Word.TokenType.eoEOS;
         word = _textParser.getNextWord() )
    {
        if( !isValuableToken( word ) )
            continue;

        if( sectionId > 0 )
            word.SectionId = sectionId;
        OMEnv.LexemeConstructor.NormalizeToken( word );
        word.SetId();
        LogTerm( word );
    }
}
/// <summary>
/// Records one occurrence of a term in the current document: bumps its
/// per-document counter, updates the maximum term frequency, and stores the
/// encoded position of the occurrence.
/// </summary>
/// <param name="word">Normalized token whose HC identifies the term.</param>
private void LogTerm( Word word )
{
    int HC = word.HC;

    //-----------------------------------------------------------------
    // Update the term's count in this document.
    //-----------------------------------------------------------------
    int termFreq;
    IntHashTableOfInt.Entry countEntry = _termCounterInDoc.GetEntry( HC );
    if( countEntry == null )
        termFreq = _termCounterInDoc[ HC ] = 1;
    else
        countEntry.Value = termFreq = countEntry.Value + 1;

    // _termMaxFrequency is a ushort, so the value is capped (slightly below
    // UInt16.MaxValue) to avoid overflow in the cast.
    if( _termMaxFrequency < termFreq )
        _termMaxFrequency = (ushort) Math.Min( termFreq, _ciMaxMeaningfulCount );

    //-----------------------------------------------------------------
    // Store the occurrence: a single occurrence is kept as a boxed long,
    // further ones promote the value to a list of longs.
    //-----------------------------------------------------------------
    long mask = MaskEncoder.Mask( word.TokenOrder, word.SentenceNumber, word.StartOffset );
    IntHashTable.Entry entry = _tokens.GetEntry( HC );
    if( entry == null )
    {
        _tokens[ HC ] = mask;
    }
    else
    {
        // Fix: the list type had lost its generic argument; the elements
        // added below are longs, so the list is List<long>.
        List<long> offsets = entry.Value as List<long>;
        if( offsets == null )
        {
            offsets = new List<long>( 4 );
            offsets.Add( (long) entry.Value );
            entry.Value = offsets;
        }
        offsets.Add( mask );
    }
}
/// <summary>
/// Finishes the current document (if any): records it as done in this batch,
/// raises ResourceProcessed when it produced terms, flushes its tokens into
/// the index and resets the per-document state.
/// </summary>
private void DocumentDone()
{
    // -1 means there is no document in progress.
    if( _lastDocID == -1 )
        return;

    _finishedDocsInBatch[ _lastDocID ] = _termMaxFrequency;

    bool hasTerms = _termCounterInDoc.Count > 0;
    Trace.WriteLineIf( !hasTerms, "-- FullTextIndexer - Skip empty document " + _lastDocID );

    // Let subscribers inspect the data gathered for the document before it
    // goes into the index (e.g. to pre-check a text condition).
    if( hasTerms && ResourceProcessed != null )
    {
        ResourceProcessed( _lastDocID, null );
    }

    ManageIndexChunk();

    // Reset state for the next document.
    _termCounterInDoc.Clear();
    _termMaxFrequency = 0;
    _tokens.Clear();
    _previewFragment.Length = 0;
    _mustConstructPreview = false;

    Flush();

    // Force a collection when more than 5000 ticks passed since the last one.
    if( _lastCollectTick + 5000 < Environment.TickCount )
    {
        GC.Collect();
        _lastCollectTick = Environment.TickCount;
    }
}
#endregion Document Scope
#region EndBatchUpdate and Chunk Construction
/// <summary>
/// Flushes the term trie and the term index accessor.
/// </summary>
private void Flush()
{
Word.FlushTermTrie();
_termsAccessor.Flush();
}
/// <summary>
/// Ends a batch: finishes the document in progress, announces the documents
/// indexed in the batch, resets batch state and re-requests indexing of the
/// documents whose newer versions arrived during the batch.
/// </summary>
public void EndBatchUpdate()
{
DocumentDone();
PropagateIndexInformation();
// Cleanup clears the batch table that PropagateIndexInformation reads, so it runs after it.
Cleanup();
RerequestDocVersionsIndexing();
}
/// <summary>
/// Writes the tokens of the current document into the term index and queues
/// a job that marks the resource as indexed. Does nothing when the document
/// produced no tokens.
/// </summary>
/// <exception cref="IOException">
/// Index construction failed; the index is discarded before the exception is rethrown wrapped.
/// </exception>
private void ManageIndexChunk()
{
    if( _tokens.Count == 0 )
        return;

    try
    {
        IndexConstructor.FlushDocument( TermIndexAccessor, _lastDocID, _termMaxFrequency, _tokens );

        IResource indexedDoc = Core.ResourceStore.TryLoadResource( _lastDocID );
        if( indexedDoc == null )
            return;

        // Until the queued job has run, the document counts as present
        // through the pending-additions set.
        _pendingLock.Enter();
        try
        {
            _pendingAddends.Add( _lastDocID );
            _pendingDeletions.Remove( _lastDocID );
        }
        finally
        {
            _pendingLock.Exit();
        }

        Core.ResourceAP.QueueJob( JobPriority.Immediate, _cJobName, new ResourceDelegate( SetIndexedProps ), indexedDoc );
    }
    catch( IOException ioError )
    {
        Trace.WriteLineIf( !_suppTrace, "-- FullTextIndexer -- Fatal IO Exception occured while constructing text index." );
        DiscardTextIndex();
        throw new IOException( "FullTextIndexer -- IO Exception in chunk construction", ioError );
    }
}
/// <summary>
/// Marks the resource as present in the current text index version and assigns
/// the preview fragment (if one was collected), then removes the resource from
/// the pending-additions set. Deleted resources are left untouched.
/// </summary>
/// <param name="res">The resource the job was queued for.</param>
private void SetIndexedProps( IResource res )
{
    if( res.IsDeleted )
        return;

    res.SetProp( DocInVersionIndexProp, _indexVersion );

    // NOTE(review): the preview is read from the shared buffer at the time
    // the job runs, not when it was queued - confirm the job always runs
    // before the buffer is reset for the next document.
    string preview = _previewFragment.ToString();
    if( preview.Length > 0 )
        res.SetProp( Core.Props.PreviewText, preview );

    #region Pending Data Update
    _pendingLock.Enter();
    try
    {
        // Fix: remove the id of the resource this job belongs to. The old
        // code used _lastDocID, which may already refer to another document
        // (or be -1) by the time the queued job runs.
        _pendingAddends.Remove( res.Id );
    }
    finally
    {
        _pendingLock.Exit();
    }
    #endregion Pending Data Update
}
/// <summary>
/// Announces that the documents finished in this batch are now in the index, so that:
///  - they no longer have to be remembered as "unflushed";
///  - they are available for searching or for completing search-dependent rules.
///
/// NB: the set of propagated documents may be non-empty even when no tokens
/// were submitted, because empty documents are recorded too.
/// </summary>
private void PropagateIndexInformation()
{
    if( _finishedDocsInBatch.Count <= 0 )
        return;

    NotifyIndexLoaded();

    IntArrayList finishedIds = IntArrayListPool.Alloc();
    try
    {
        foreach( IntHashTableOfInt.Entry finished in _finishedDocsInBatch )
        {
            finishedIds.Add( finished.Key );
        }
        PropagateSearchableDocuments( finishedIds );
    }
    finally
    {
        // The list comes from a pool and is always handed back.
        IntArrayListPool.Dispose( finishedIds );
    }
}
/// <summary>
/// Raises IndexLoaded once, the first time the term index is found to
/// contain at least one term.
/// </summary>
private void NotifyIndexLoaded()
{
    if( _notificationAlreadyDone )
        return;
    if( _termsAccessor.TermsNumber == 0 )
        return;

    // The flag is set before the event so it is visible to handlers.
    _notificationAlreadyDone = true;
    if( IndexLoaded != null )
    {
        IndexLoaded( this, EventArgs.Empty );
    }
}
/// <summary>
/// Flushes word forms to the wordforms file and resets the batch state:
/// no document in progress and no documents finished in the batch.
/// </summary>
private void Cleanup()
{
OMEnv.DictionaryServer.FlushWordforms( OMEnv.WordformsFileName );
// -1 marks "no document in progress".
_lastDocID = -1;
_finishedDocsInBatch.Clear();
}
#endregion EndBatchUpdate and Chunk Construction
#region IFullTextIndexer
/// <summary>
/// True once the IndexLoaded notification has been issued.
/// </summary>
public bool IsIndexPresent
{
get { return _notificationAlreadyDone; }
}
/// <summary>
/// Tells whether the document counts as present in the text index, taking
/// pending additions and deletions into account.
/// </summary>
/// <param name="docID">Id of the document resource.</param>
/// <returns>
/// False when the index is not present or the resource cannot be loaded.
/// Otherwise: for a resource marked with the current index version, true
/// unless a deletion is pending; for any other resource, true only when an
/// addition is pending.
/// </returns>
public bool IsDocumentPresent( int docID )
{
    if( !IsIndexPresent )
        return false;

    IResource doc = Core.ResourceStore.TryLoadResource( docID );
    if( doc == null )
        return false;

    _pendingLock.Enter();
    try
    {
        return IsDocumentInCurrentTextIndex( doc )
            ? !_pendingDeletions.Contains( doc.Id )
            : _pendingAddends.Contains( doc.Id );
    }
    finally
    {
        _pendingLock.Exit();
    }
}
/// <summary>
/// Same check as IsDocumentPresent, but without requiring that the index
/// is reported as present.
/// </summary>
/// <param name="docId">Id of the document resource.</param>
/// <returns>
/// False when the resource cannot be loaded. Otherwise: for a resource marked
/// with the current index version, true unless a deletion is pending; for any
/// other resource, true only when an addition is pending.
/// </returns>
public bool IsDocumentPresentInternal( int docId )
{
    IResource doc = Core.ResourceStore.TryLoadResource( docId );
    if( doc == null )
        return false;

    bool present;
    _pendingLock.Enter();
    try
    {
        if( IsDocumentInCurrentTextIndex( doc ) )
            present = !_pendingDeletions.Contains( doc.Id );
        else
            present = _pendingAddends.Contains( doc.Id );
    }
    finally
    {
        _pendingLock.Exit();
    }
    return present;
}
/// <summary>
/// Tells whether the resource is marked with the current text index version.
/// </summary>
/// <param name="doc">Resource to check.</param>
public static bool IsDocumentInCurrentTextIndex( IResource doc )
{
    // Per the original note: GetIntProp yields 0 for a resource without the
    // property, and index versions start from 1, so such resources never match.
    return doc.GetIntProp( DocInVersionIndexProp ) == _indexVersion;
}
/// <summary>
/// Delegate taking a single int; used by DeleteDocument to queue MarkNotInIndex with a document id.
/// </summary>
internal delegate void IntDelegate( int ind );
/// <summary>
/// Marks the document as deleted from the text index: records a pending
/// deletion immediately and queues a job that clears the resource's index
/// version property.
/// </summary>
/// <param name="docID">Id of the document resource.</param>
/// <exception cref="ApplicationException">The index is not present.</exception>
public void DeleteDocument( int docID )
{
    if( !IsIndexPresent )
    {
        throw new ApplicationException( "Intermodule communication error - caller CAN NOT call this method when index is not present" );
    }

    // From now on presence checks report the document as absent, even
    // before the queued job has run.
    _pendingLock.Enter();
    try
    {
        _pendingDeletions.Add( docID );
        _pendingAddends.Remove( docID );
    }
    finally
    {
        _pendingLock.Exit();
    }

    IntDelegate marker = new IntDelegate( MarkNotInIndex );
    Core.ResourceAP.QueueJob( JobPriority.Immediate, "Marking document not present in text index", marker, docID );
}
/// <summary>
/// Resets the resource's index version property to 0 and drops its pending
/// deletion. Does nothing when the resource cannot be loaded.
/// </summary>
private void MarkNotInIndex( int id )
{
    IResource doc = Core.ResourceStore.TryLoadResource( id );
    if( doc == null )
        return;

    doc.SetProp( DocInVersionIndexProp, 0 );

    _pendingLock.Enter();
    try
    {
        _pendingDeletions.Remove( id );
    }
    finally
    {
        _pendingLock.Exit();
    }
}
#endregion
#region Query Processing
// A query of this length or longer is accepted by isValidQuery only when it contains a space.
internal const int _MaxQueryTokenLength = 255;
/// <summary>
/// Outcome of a query: the matching entries plus auxiliary information.
/// </summary>
public class QueryResult
{
    /// <summary>Creates a result with no entries that is not a singular-term result.</summary>
    public QueryResult()
    {
        Result = null;
        IsSingularTerm = false;
    }

    /// <summary>Matching entries; null when nothing matched or no search was made.</summary>
    public Entry[] Result;

    /// <summary>True when the parsed query consists of exactly one term node.</summary>
    public bool IsSingularTerm;

    /// <summary>Error text of the parser or query processor; stays null when no error was reported.</summary>
    public string ErrorMessage;
}
/// <summary>
/// Matches the query against the tokens collected for the document that is
/// currently being processed.
/// </summary>
/// <param name="query">Query text.</param>
/// <param name="resId">Must equal the id of the document in progress.</param>
/// <param name="dummy">Not used.</param>
/// <returns>True on a match; false when the query is rejected, cannot be parsed or does not match.</returns>
/// <exception cref="ArgumentException">resId is not the document in progress.</exception>
public bool MatchQuery( string query, int resId, int dummy )
{
    if( resId != _lastDocID )
    {
        throw new ArgumentException( "MatchQuery (FullTextIndexer) -- Input resource Id does not match internal data" );
    }

    if( !isValidQuery( query ) )
        return false;

    QueryPostfixForm parsed = QueryParser.ParseQuery( query );
    if( parsed == null )
        return false;

    return MatchProcessor.MatchQuery( parsed, _tokens );
}
/// <summary>
/// Runs the query against the term index and keeps only the entries whose
/// documents are currently present in the index.
/// </summary>
/// <param name="query">Query text.</param>
/// <param name="dummy">Not used.</param>
/// <returns>The query result; its Result is null when no present document matched.</returns>
public QueryResult ProcessQuery( string query, int dummy )
{
    Debug.Assert( IsIndexPresent, "Intermodule communication error - caller CAN NOT call this method without opened text index" );

    QueryResult answer = PerformInitialSearch( query );
    IntHashTable presentHits = CompressEntries( answer.Result );
    FillResult( answer, presentHits );

    Trace.WriteLineIf( !_suppTrace, "--- Query [" + query + "]: " + presentHits.Count + " hits found" );
    return answer;
}
/// <summary>
/// Parses the query and runs it against the term index.
/// </summary>
/// <param name="query">Query text.</param>
/// <returns>
/// A result whose ErrorMessage is set when parsing fails or an illegal
/// section name is used. For a query rejected by isValidQuery the result is
/// returned untouched (no entries, no error message).
/// </returns>
private QueryResult PerformInitialSearch( string query )
{
    QueryResult qResult = new QueryResult();

    // Search only when the query string is of "reasonable" length.
    if( isValidQuery( query ) )
    {
        QueryPostfixForm form = QueryParser.ParseQuery( query );
        if( form != null )
        {
            qResult.IsSingularTerm = ( form.TermNodesCount == 1 );
            qResult.Result = QueryProcessor.ProcessQuery( form, TermIndexAccessor );
            if( QueryProcessor.Status == QueryProcessor.ErrorStatus.IllegalSectionName )
            {
                // Fix: user-facing message used to read "secion names".
                qResult.ErrorMessage = "Illegal document section name specified. Please consult help file for valid section names.";
            }
        }
        else
        {
            qResult.ErrorMessage = QueryParser.Error;
        }
    }
    return qResult;
}
/// <summary>
/// Builds a table of entries keyed by document id, keeping only documents
/// that are present in the index. Keying by id ensures there are no
/// duplicated ids, which can occur when a document is removed from the index
/// and then inserted again with the same id; a later entry overwrites an
/// earlier one.
/// </summary>
/// <param name="result">Entries to filter; may be null.</param>
/// <returns>A table (possibly empty) of document id to entry.</returns>
private IntHashTable CompressEntries( IEnumerable result )
{
    IntHashTable validEntries = new IntHashTable();
    if( result == null )
        return validEntries;

    // Not optimal, but compact.
    foreach( Entry hit in result )
    {
        if( !IsDocumentPresent( hit.DocIndex ) )
            continue;
        validEntries[ hit.DocIndex ] = hit;
    }
    return validEntries;
}
/// <summary>
/// Replaces the result's entries with the values of the table, or with null
/// when the table is empty.
/// </summary>
private static void FillResult( QueryResult qResult, IntHashTable validEntries )
{
    qResult.Result = null;
    if( validEntries.Count <= 0 )
        return;

    qResult.Result = new Entry[ validEntries.Count ];
    Entry[] target = qResult.Result;
    int next = 0;
    foreach( IntHashTable.Entry pair in validEntries )
    {
        target[ next ] = (Entry) pair.Value;
        next++;
    }
}
/// <summary>
/// Simplified query processing, used in tests: runs the query and keeps the
/// entries whose documents pass IsDocumentPresentInternal.
/// </summary>
/// <param name="query">Query text.</param>
/// <returns>
/// The matching entries, or null when the query is rejected, cannot be
/// parsed, or no present document matches.
/// </returns>
public Entry[] ProcessQueryInternal( string query )
{
    Entry[] resultEntries = null;
    if( isValidQuery( query ) )
    {
        QueryPostfixForm form = QueryParser.ParseQuery( query );

        // Fix: the parsed form was passed on unchecked. The other query
        // entry points treat a null form as a parse failure, so do the same
        // here instead of handing null to the query processor.
        if( form != null )
        {
            resultEntries = QueryProcessor.ProcessQuery( form, TermIndexAccessor );
        }

        if( resultEntries != null )
        {
            ArrayList list = new ArrayList();
            foreach( Entry e in resultEntries )
            {
                if( IsDocumentPresentInternal( e.DocIndex ) )
                    list.Add( e );
            }
            resultEntries = ( list.Count > 0 ) ? (Entry[]) list.ToArray( typeof( Entry ) ) : null;
        }
    }
    return resultEntries;
}
/// <summary>
/// Accepts a query that is shorter than _MaxQueryTokenLength or that
/// contains at least one space.
/// </summary>
private static bool isValidQuery( string query )
{
    if( query.Length < _MaxQueryTokenLength )
        return true;

    // Long queries are accepted only when they are made of several words.
    return query.IndexOf( ' ' ) >= 0;
}
#endregion Query Processing
#region Accessors and Closers
/// <summary>
/// Creates a new accessor over the term index file and loads it.
/// </summary>
private void LoadTermIndex()
{
Trace.WriteLineIf( !_suppTrace, "-- FullTextIndexer -- Started creating Accessor over [" + OMEnv.TermIndexFileName + "]" );
// The field is assigned before Load() runs, so it is already non-null if Load() throws;
// DiscardTextIndexImpl, which the load error handlers call, requires a non-null accessor.
_termsAccessor = new TermIndexAccessor( OMEnv.TermIndexFileName );
_termsAccessor.Load();
Trace.WriteLineIf( !_suppTrace, "-- FullTextIndexer -- TermIndexAccessor loaded " + _termsAccessor.TermsNumber + " terms" );
}
/// <summary>
/// The accessor over the term index.
/// </summary>
/// <exception cref="ApplicationException">The accessor has not been created yet.</exception>
public TermIndexAccessor TermIndexAccessor
{
    get
    {
        if( _termsAccessor != null )
            return _termsAccessor;

        throw new ApplicationException( "FullTextIndexer -- TermIndex loading conditions violation - Index accessor is not initialized" );
    }
}
/// <summary>
/// Flushes and closes the term index; does nothing when it was never opened.
/// </summary>
public void CloseIndices()
{
    if( _termsAccessor == null )
        return;

    Flush();
    TermIndexAccessor.Close();
}
/// <summary>
/// Parameterless delegate; used by DiscardTextIndexImpl to run IncrementIndexVersionProperty as a job.
/// </summary>
private delegate void SimpleDelegate();
/// <summary>
/// Discards the text index and reopens the index structures.
/// </summary>
public void DiscardTextIndex()
{
DiscardTextIndexImpl( true );
}
/// <summary>
/// Discards the text index: clears the in-memory state, deletes temporary
/// files, discards the index data through the accessor, optionally reopens
/// the index, increments the index version and clears the pending sets.
/// </summary>
/// <param name="reopenIndex">True to load the term index anew after discarding.</param>
/// <exception cref="ApplicationException">The term index accessor has not been created yet.</exception>
public void DiscardTextIndexImpl( bool reopenIndex )
{
#region Preconditions
if( _termsAccessor == null )
throw new ApplicationException( "FullTextIndexer -- TextIndexer is not initialized yet" );
#endregion Preconditions
Trace.WriteLineIf( !_suppTrace, "-- FullTextIndexer -- Discard Index is started." );
// Cleanup flushes word forms, resets _lastDocID and clears _finishedDocsInBatch
// (the explicit clear of that table below repeats the latter).
Cleanup();
_docVersionsToProcess.Clear();
_finishedDocsInBatch.Clear();
_termCounterInDoc.Clear();
_tokens.Clear();
CleanIndexTempFiles();
// Discard data on disk and reopen data structures anew if necessary
_termsAccessor.Discard();
if( reopenIndex )
{
LoadTermIndex();
}
// Incrementing the version makes every resource marked with the old version
// fail IsDocumentInCurrentTextIndex.
// NOTE(review): presumably RunJob executes the delegate synchronously - confirm.
Core.ResourceAP.RunJob( "Marking all documents not present in text index", new SimpleDelegate( IncrementIndexVersionProperty ) );
#region Pending Data Update
_pendingLock.Enter();
try
{
_pendingAddends.Clear();
_pendingDeletions.Clear();
}
finally
{
_pendingLock.Exit();
}
#endregion Pending Data Update
Trace.WriteLineIf( !_suppTrace, "-- FullTextIndexer -- Index has been discarded successfully." );
}
/// <summary>
/// Increases the current index version by one and stores the new value in
/// the version resource.
/// </summary>
private static void IncrementIndexVersionProperty()
{
    int nextVersion = _indexVersion + 1;
    _indexVersion = nextVersion;
    _indexVersionRes.SetProp( TextIndexVersionProp, nextVersion );
}
/// <summary>
/// Deletes the file when it exists.
/// </summary>
private static void DeleteFile( string fileName )
{
    if( !File.Exists( fileName ) )
        return;

    File.Delete( fileName );
}
/// <summary>
/// Intentionally empty; the indices are flushed elsewhere (see Flush).
/// </summary>
public void FlushIndices() {}
#endregion Accessors and Closers
#region Open methods for deep access
/// <summary>
/// Returns the term index record for the given term hash code, or null when
/// the term is not in the index.
/// </summary>
public object GetTermRecordMain( int HC )
{
    if( !TermIndexAccessor.TermExist( HC ) )
        return null;

    return TermIndexAccessor.GetRecordByHC( HC );
}
/// <summary>
/// Always returns null; the argument is not used.
/// </summary>
public object GetTermRecordMem( int HC )
{
return null;
}
/// <summary>
/// Traces the loaded and saved record counters of the term index accessor
/// (regardless of the trace-suppression flag).
/// </summary>
public void TraceIndexPerformanceCounters()
{
Trace.WriteLine( "Term index: loaded " + TermIndexAccessor.LoadedRecords + " records" );
Trace.WriteLine( "Term index: saved " + TermIndexAccessor.SavedRecords + " records" );
}
#endregion Open methods for deep access
#region Auxiliaries
/// <summary>
/// Tells whether the word's token is worth indexing, i.e. is not a stop word.
/// </summary>
private static bool isValuableToken( Word word )
{
return isValuableToken( word.Token );
}
/// <summary>
/// Tells whether the token is worth indexing: true unless the dictionary
/// server reports it as a stop word.
/// </summary>
public static bool isValuableToken( string token )
{
    bool isStopWord = OMEnv.DictionaryServer.isStopWord( token );
    return !isStopWord;
}
/// <summary>
/// Raises NextUpdateFinished with a copy of the given document ids; does
/// nothing when the event has no subscribers.
/// </summary>
/// <param name="newDocsInChunk">Ids of the documents that became searchable.</param>
public void PropagateSearchableDocuments( IntArrayList newDocsInChunk )
{
    if( NextUpdateFinished == null )
        return;

    DocsArrayArgs args = new DocsArrayArgs( newDocsInChunk.ToArray() );
    NextUpdateFinished( this, args );
}
#endregion
#region Attributes
/// <summary>
/// Event is raised:
/// - when the complete text index is loaded into the accessor, or
/// - in the case of index loading error and its reconstruction, after
/// the first chunk was successfully processed.
/// </summary>
public event EventHandler IndexLoaded;
// Set once IndexLoaded has been raised; exposed through IsIndexPresent.
private bool _notificationAlreadyDone;
// Raised by DocumentDone for a finished document that produced at least one term;
// the document id is passed as the sender and the event args are null.
public event EventHandler ResourceProcessed;
/// <summary>
/// NextUpdateFinished is raised when [new] portion of documents is
/// converted to a index chunk, feasible for searching (it may be
/// main or incremental chunk).
/// </summary>
public event UpdateFinishedEventHandler NextUpdateFinished;
//---------------------------------------------------------------------
// Upper bound applied to _termMaxFrequency so it fits a ushort.
private const int _ciMaxMeaningfulCount = 65000;
// Maximum number of characters kept in the preview fragment.
private const int _cPreviewSize = 120;
// Name of the job that runs SetIndexedProps.
private const string _cJobName = "Marking document as present in text index";
private TextDocParser _textParser;
private TermIndexAccessor _termsAccessor;
// Term hash code -> number of occurrences in the document being processed.
private readonly IntHashTableOfInt _termCounterInDoc = new IntHashTableOfInt( 2000 );
// Ids of documents whose newer version arrived while the batch was running.
private readonly IntHashTableOfInt _docVersionsToProcess = new IntHashTableOfInt();
// Document id -> maximum term frequency, for documents finished in the current batch.
private readonly IntHashTableOfInt _finishedDocsInBatch = new IntHashTableOfInt();
// Section name -> section order number.
private readonly HashMap _sectionsMapping = new HashMap();
private ushort _termMaxFrequency = 0;
// Term hash code -> encoded occurrence(s) in the document being processed.
private readonly IntHashTable _tokens = new IntHashTable( 2000 );
// Documents added to / deleted from the index whose resource property has not
// been updated yet; both sets are accessed under _pendingLock.
private readonly IntHashSet _pendingAddends = new IntHashSet( 100 );
private readonly IntHashSet _pendingDeletions = new IntHashSet( 100 );
private SpinWaitLock _pendingLock = new SpinWaitLock();
// Read from the "TextIndexing"/"SuppressTraces" setting in Initialize.
public static bool _suppTrace;
private readonly StringBuilder _previewFragment = new StringBuilder();
private bool _mustConstructPreview;
// Id of the document being processed; -1 when there is none.
private int _lastDocID = -1;
private uint _prevSectionId;
// Ids of the properties registered in RegisterPropertyTypes.
public static int SimilarityPropId, SearchRankPropId, ProximityPropId,
ContextPropId, ContextHighlightPropId; //, DocInIndexProp;
// Ids of the properties registered in RegisterIndexVersioningTypes.
public static int TextIndexVersionProp, DocInVersionIndexProp;
// Set when the token tree file is missing; makes LoadExistingIndices discard the index.
private bool _needDiscard;
// Environment.TickCount value at the last forced garbage collection.
private int _lastCollectTick;
private static FullTextIndexer theIndexer;
// Resource holding the current index version, and the cached version value.
private static IResource _indexVersionRes;
private static int _indexVersion;
#endregion Attributes
}
}