///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.Text;
using System.IO;
using System.Collections;
using System.Diagnostics;
using JetBrains.Omea.Base;
using JetBrains.DataStructures;
/*
DictionaryServer is a component to access string data by string Key.
DictionaryServer collects data from one or several plain text files
where lines are sorted case-insensitively (it is an obligatory condition,
otherwise DictionaryServer will not work properly) to make loading data
quicker. Each source text file can contain lines of two different formats:
1)
Search is performed only for presence of the given Key.
2) $
Search is performed for presence of the given Key and DataValue is
returned
Hierarchy of data access is the following:
Server->Sequence->Entry->(Key -> Value)
DictionaryServer performs proper data loading and supplies subcollections
of data - Sequencies. Sequence can create its own subsequences by any
string mask: including in the subsequnce all key-data pairs which keys
are started from the given mask.
Sequence consists of Entry pairs that represent Key and Value correspondence.
All keys over all sources *must* be unique; i.e. the given Entry
represents one source and can return TAG of this source.
Features
--------
1. DictionaryServer's search mechanism relies on the case-insensitive sort.
2. Case-insensitive sort is made by converting all characters in the
Key strings to the upper case (NB: in difference to the default C string
case-insensitive comparison, where all characters are converted to the
lower case before comparison).
Dependencies
------------
No external component is used.
*/
namespace JetBrains.Omea.TextIndex
{
public class DictionaryServer
{
//-------------------------------------------------------------------------
public DictionaryServer( string WordformsFileName, string unchangeablesFileName,
string stoplistFileName, params string[] dictionaries )
: this( dictionaries )
{
LoadTokenList( stoplistFileName, StopList );
LoadTokenList( unchangeablesFileName, Unchangeables );
ReconstructWordformsMappings( WordformsFileName );
}
private DictionaryServer( params string[] dictionaries )
{
/**
* try to load compiled dictionaries
*/
if( File.Exists( OMEnv.CompiledDicsFileName ) )
{
try
{
HashMap compiledDicNames = new HashMap();
FileStream compiled = IOTools.OpenRead( OMEnv.CompiledDicsFileName, 0x10000 );
using( compiled )
{
BinaryReader reader = new BinaryReader( compiled, Encoding.Unicode );
string name;
while( ( name = reader.ReadString() ).Length > 0 )
{
compiledDicNames[ name ] = reader.ReadInt64();
}
bool need2Rebuild = false;
// check whether all necessary dictionaries are compiled
foreach( string dictionary in dictionaries )
{
if( !compiledDicNames.Contains( dictionary ) ||
(long) compiledDicNames[ dictionary ] !=
IOTools.GetLastWriteTime( IOTools.GetFileInfo( dictionary ) ).Ticks )
{
need2Rebuild = true;
break;
}
}
if( !need2Rebuild )
{
int charCount = reader.ReadInt32();
DicBase = reader.ReadChars( charCount );
int intCount = reader.ReadInt32();
BaseIndices = new int[ intCount ];
for( int i = 0; i < intCount; ++i )
{
BaseIndices[ i ] = reader.ReadInt32();
}
if( BaseIndices.Length >= 2 )
{
LastWordCharIndex = BaseIndices[ BaseIndices.Length - 2 ];
FirstDicChar = DicBase[ 0 ];
LastDicChar = DicBase[ LastWordCharIndex ];
}
return;
}
}
}
catch {}
}
DefaultDictionariesLoading( dictionaries );
SaveCompiledDictionaries( dictionaries );
}
#region LoadAndNormalization
private void DefaultDictionariesLoading( string[] dictionaries )
{
foreach( string str_ in dictionaries )
{
try
{
using( StreamReader file_ = new StreamReader( str_ ) )
{
LoadDictionaryIntoPool( file_ );
}
}
catch( FileNotFoundException exc_ )
{
Trace.WriteLine( "Can not process dictionary file [" + str_ + "], reason - " + exc_.Message );
}
}
aUnitedDictionary.Sort( new StringStrictComparer() );
aUnitedDictionary.TrimToSize();
NormalizeDictionaryEntries();
CheckDuplicatesAndReductions();
ConvertArray2String();
aUnitedDictionary.Clear();
aUnitedDictionary = null;
}
/**
* save compiled dictionary
*/
private void SaveCompiledDictionaries( string[] dictionaries )
{
FileStream compiled = new FileStream( OMEnv.CompiledDicsFileName, FileMode.Create, FileAccess.Write );
using( compiled )
{
BinaryWriter writer = new BinaryWriter( compiled, Encoding.Unicode );
foreach( string dictionary in dictionaries )
{
writer.Write( dictionary );
writer.Write( IOTools.GetLastWriteTime( IOTools.GetFileInfo( dictionary ) ).Ticks );
}
writer.Write( string.Empty );
writer.Write( DicBase.Length );
writer.Write( DicBase );
writer.Write( BaseIndices.Length );
for( int i = 0 ; i < BaseIndices.Length; ++i )
writer.Write( BaseIndices[ i ] );
}
}
private void LoadTokenList( string fileName, HashSet storage )
{
if( File.Exists( fileName ) )
{
string str;
using( StreamReader file = new StreamReader( fileName ) )
{
while(( str = file.ReadLine()) != null )
{
storage.Add( str.Trim() );
}
}
}
}
//---------------------------------------------------------------------
///
/// Load new dictionary entries into the pool of other (possibly)
/// loaded entries.
/// Perform substitution of all blanks to "&" (which code is greater
/// than that of blank and "$" - delimiter between Key and DataValue.
///
//-------------------------------------------------------------------------
protected void LoadDictionaryIntoPool( StreamReader file_ )
{
string str_Buffer;
while(( str_Buffer = file_.ReadLine()) != null )
{
aUnitedDictionary.Add( str_Buffer.Replace( " ", "&" ).ToLower() );
}
}
//---------------------------------------------------------------------
///
/// Perform reverse replacing of '&' into blanks, so that LowerBound
/// can properly find sequences starting from the same words.
///
//---------------------------------------------------------------------
protected void NormalizeDictionaryEntries()
{
for( int i = 0; i < aUnitedDictionary.Count; i++ )
{
aUnitedDictionary[ i ] = ((string)aUnitedDictionary[ i ]).Replace( "&", " " );
}
}
//---------------------------------------------------------------------
///
/// - remove duplicated strings
/// - remove standalone lexemes which also have mapping variants (next
/// to them in the list).
///
//---------------------------------------------------------------------
protected void CheckDuplicatesAndReductions()
{
int i = 0;
while( i < aUnitedDictionary.Count - 1 )
{
if( (string)aUnitedDictionary[ i ] == (string)aUnitedDictionary[ i + 1 ] )
aUnitedDictionary.RemoveAt( i );
else
if( ((string)aUnitedDictionary[ i + 1 ]).StartsWith( (string)aUnitedDictionary[ i ] ) &&
((string)aUnitedDictionary[ i + 1 ]).Length > ((string)aUnitedDictionary[ i ]).Length &&
((string)aUnitedDictionary[ i + 1 ])[ ((string)aUnitedDictionary[ i ]).Length ] == '$' )
{
aUnitedDictionary.RemoveAt( i );
}
else
i++;
}
}
protected void ConvertArray2String()
{
// calc the size of the destination store
int totalByteSize = 0;
foreach( string str in aUnitedDictionary )
{
totalByteSize += str.Length;
}
totalByteSize += aUnitedDictionary.Count; // # of 0x00 bytes
// create linear structures
DicBase = new char[ totalByteSize ];
BaseIndices = new int[ aUnitedDictionary.Count + 1 ];
// copy string by string
int charIndex = 0;
for( int i = 0; i < aUnitedDictionary.Count; i++ )
{
string str = (string)aUnitedDictionary[ i ];
str.CopyTo( 0, DicBase, charIndex, str.Length );
BaseIndices[ i ] = charIndex;
charIndex += str.Length;
DicBase[ charIndex++ ] = (char)0x00;
}
// fake index, to simplify calculation of last string length
BaseIndices[ aUnitedDictionary.Count ] = charIndex;
LastWordCharIndex = BaseIndices[ aUnitedDictionary.Count - 1 ];
FirstDicChar = DicBase[ 0 ];
LastDicChar = DicBase[ LastWordCharIndex ];
}
#endregion
#region PredicateLookup
public bool isUnchangeable( string str )
{
return Unchangeables.Contains( str );
}
public bool isStopWord( string str )
{
return StopList.Contains( str );
}
///
/// Use BinarySearch on sorted sequence to find the lowerbound of possible
/// keys. Three cases are possible:
///
/// 1. pure [^Key$] is present in the sequence - BinarySearch >= 0 and points
/// to the position of Key in the sequence
///
/// 2. [^Key$Value$] is present in the sequence - BinarySearch less than 0
/// and points to the position of Key$Data in the sequence
///
/// 3. [^Key ...smth...$Value$] is present in the sequence - BinarySearch
/// less than 0 and points to the position of ^Key ...smth...$Value$
/// in the sequence.
///
///
/// true, if correponding dictionary entry (single or compound) was
/// found, false otherwise.
///
public bool FindLowerBound( string mask, out int baseIndex, out int length )
{
length = baseIndex = -1;
if( mask.Length > 0 )
{
if(( mask[ 0 ] >= FirstDicChar ) && ( mask[ 0 ] <= LastDicChar ))
{
baseIndex = BinarySearch( mask );
if( baseIndex >= 0 || isMaskAsProperPrefix( mask, -baseIndex ))
{
int absIndex = Math.Abs( baseIndex );
int baseOffset = BaseIndices[ absIndex ];
length = BaseIndices[ absIndex + 1 ] - baseOffset - 1;
baseIndex = (baseIndex > 0) ? baseOffset : -baseOffset;
}
else
baseIndex = -1;
}
}
return( baseIndex != -1 );
}
public int GetMapIndex( int start, int length )
{
Debug.Assert( start >= 0, "Start index is negative" );
Debug.Assert( start < DicBase.Length, "Start index is larger than dictionary base" );
Debug.Assert( start + length < DicBase.Length, "End index is larger than dictionary base" );
int endIndex = start + length;
for( int i = start + 1; i < endIndex; i++ )
{
if( DicBase[ i ] == '$' )
return i + 1;
}
return -1;
}
#endregion
#region Auxiliary
//-------------------------------------------------------------------------
// Use BinarySearch on sorted sequence to find the lowerbound of possible
// keys. Three cases are possible:
// 1. pure [^Key$] is present in the sequence - BinarySearch >= 0 and points
// to the position of Key in the sequence
// 2. [^Key$Value$] is present in the sequence - BinarySearch < 0 and points
// to the position of Key$Data in the sequence
// 3. [^Key ...smth...$Value$] is present in the sequence - BinarySearch < 0
// and points to the position of ^Key ...smth...$Value$ in the sequence.
//-------------------------------------------------------------------------
private int BinarySearch( string mask )
{
int left = 0, right = BaseIndices.Length - 2, middle = 0;
int compareStatus = 0;
while( left <= right )
{
middle = (left + right) / 2;
compareStatus = CompareStrings( mask, middle );
if( compareStatus < 0 )
right = middle - 1;
else
if( compareStatus > 0 )
left = middle + 1;
else
return middle;
}
return (compareStatus > 0) ? -left : -middle;
}
private int CompareStrings( string mask, int index )
{
int maskLen = mask.Length;
int dicLen = BaseIndices[ index + 1 ] - BaseIndices[ index ] - 1;
int minLen = Math.Min( maskLen, dicLen );
int baseIndex = BaseIndices[ index ];
for( int i = 0; i < minLen; i++ )
{
if( mask[ i ] < DicBase[ baseIndex + i ] )
return -1;
else
if( mask[ i ] > DicBase[ baseIndex + i ] )
return 1;
}
if( maskLen < dicLen )
return -maskLen;
else
if( maskLen > dicLen )
return dicLen;
else
return 0;
}
protected bool isMaskAsProperPrefix( string mask, int keyIndex )
{
Debug.Assert( keyIndex >= 0 && keyIndex < BaseIndices.Length );
if( keyIndex == BaseIndices.Length - 1 )
return false;
int maskLen = mask.Length;
int baseIndex = BaseIndices[ keyIndex ];
int keyLen = BaseIndices[ keyIndex + 1 ] - baseIndex - 1;
return(( keyLen > maskLen ) && ( DicBase[ baseIndex + maskLen ] == '$' ) &&
( CompareStrings( mask, keyIndex ) == -maskLen ));
}
public char GetChar( int offset )
{
Debug.Assert( offset >= 0 && offset < DicBase.Length );
return DicBase[ offset ];
}
public string GetDicString( int offsetIndex, int length )
{
Debug.Assert( offsetIndex >= 0 && offsetIndex < DicBase.Length );
return new String( DicBase, offsetIndex, length );
}
#endregion Auxiliary
#region Wordforms Processing
public int GetWordformVariant( string token )
{
HashMap.Entry e = WordformsVariant.GetEntry( token );
return (e != null) ? (int) e.Value : -1;
}
public int StoreMapping( string wordform, string lexeme )
{
Debug.Assert( !WordformsVariant.Contains( wordform ) );
int derivationVariant;
ArrayList forms;
HashMap.Entry e = WordformsValues.GetEntry( lexeme );
if( e != null )
{
forms = (ArrayList)e.Value;
}
else
{
forms = new ArrayList();
WordformsValues[ lexeme ] = forms;
}
forms.Add( wordform );
WordformsVariant[ wordform ] = derivationVariant = forms.Count;
WordformsChanged = true;
return( derivationVariant );
}
//---------------------------------------------------------------------
// NB: Exceptional conditions are possible if e.g. we failed to flush
// wordforms file into HD (due to any external conditions) and
// reread its previous state.
//---------------------------------------------------------------------
public string GetLexemeMapping( string lexeme, int mapVariant )
{
if( mapVariant <= 0 )
{
throw new ArgumentOutOfRangeException( "DictionaryServer -- Mapping variant parameter must be positive." );
}
HashMap.Entry e = WordformsValues.GetEntry( lexeme );
if( e == null )
{
throw new ArgumentException( "DictionaryServer -- Illegal call to GetMappingLexeme - lexeme [" +
lexeme + " is not present with index " + mapVariant );
}
ArrayList forms = (ArrayList) e.Value;
if( mapVariant > forms.Count )
{
throw new ArgumentOutOfRangeException( "DictionaryServer -- Mapping variant [" + mapVariant +
"] is greater than the number of possible wordforms " + forms.Count );
}
return (string)forms[ mapVariant - 1 ];
}
public void FlushWordforms( string fileName )
{
if( !WordformsChanged )
return;
try
{
StreamWriter writer = new StreamWriter( fileName );
foreach( HashMap.Entry e in WordformsValues )
{
ArrayList list = (ArrayList) e.Value;
writer.Write( e.Key + " - " );
for( int i = 0; i < list.Count; i++ )
{
writer.Write( (string) list[ i ] );
if( i != list.Count - 1 )
{
writer.Write( "|" );
}
}
writer.WriteLine();
}
writer.Close();
WordformsChanged = false;
}
catch( System.IO.IOException exc )
{
Trace.WriteLine( "DictionaryServer -- Did not manage to flush wordforms into standard file ["
+ fileName + "] due to the exception:");
Trace.WriteLine( exc.Message );
}
}
protected void ReconstructWordformsMappings( string WordformsFileName )
{
if( File.Exists( WordformsFileName ) )
{
StreamReader reader = new StreamReader( WordformsFileName );
string Buffer;
WordformsVariant.Clear();
WordformsValues.Clear();
while(( Buffer = reader.ReadLine()) != null )
{
int i_DelimiterIndex = Buffer.IndexOf( " - " );
if( i_DelimiterIndex == -1 )
{
// Wordforms index corrupted - no delimiter found
continue;
}
string Lexeme = Buffer.Substring( 0, i_DelimiterIndex );
Buffer = Buffer.Substring( i_DelimiterIndex + 3 );
string[] Wordforms = Buffer.Split( '|' );
ArrayList FormsForLexeme = new ArrayList();
for( int i = 0; i < Wordforms.Length; i++ )
{
// Forbid adding the same wordform twice - this can
// happen e.g. as the result of corruption or other
// internal bugs.
string wordform = Wordforms[ i ];
if( FormsForLexeme.IndexOf( wordform ) == -1 )
{
FormsForLexeme.Add( wordform );
WordformsVariant[ wordform ] = FormsForLexeme.Count;
}
}
WordformsValues[ Lexeme ] = FormsForLexeme;
}
reader.Close();
}
}
#endregion
//---------------------------------------------------------------------
//---------------------------------------------------------------------
protected ArrayList aUnitedDictionary = new ArrayList();
protected HashSet StopList = new HashSet();
protected HashSet Unchangeables = new HashSet();
protected HashMap WordformsVariant = new HashMap();
protected HashMap WordformsValues = new HashMap();
protected char[] DicBase;
protected int[] BaseIndices;
protected int LastWordCharIndex = -1;
protected char FirstDicChar, LastDicChar;
protected bool WordformsChanged = false;
}
}