///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
using System.Diagnostics;
using JetBrains.Omea.Base;
namespace JetBrains.Omea.TextIndex
{
/******************************************************************************
Class defines major logic for scriptable morphological analysis:
1. Determine the possible hypothetical lexemes from the wordform given
and grammar rules.
2. Filter out those hypotheses which do not appear in the dictionary.
Comment: the order of the hypotheses which have passed the dictionary test
is not defined.
Class determines possible lexeme alternatives and their POS using set of
rule from the given grammar.
A rule has the following format:
=> {| }*
[word template] is unique across the grammar and has format "*XXXX",
where "XXXX" defines the word suffix to be analyzed.
[modifier] is in ['=', '~', '*'] :
'=' - the word is not changed
'~' - suffix is stripped out from the word
'*' - suffix is substituted by the symbols following after the '*'
******************************************************************************/
public class ScriptMorphoAnalyzer
{
//-------------------------------------------------------------------------
public ScriptMorphoAnalyzer( string GrammarFileName )
{
Root = new Node();
listLexemes = new ArrayList();
reg = new Regex( "^ *(.+?) *=> *(.+) *$" );
string content = Utils.ReadEncodedFile( GrammarFileName );
ParseGrammar( content );
}
#region Grammar Parsing
//-------------------------------------------------------------------------
// Each line in the grammar file is a separate rule describing a
// {word|ending}, except lines starting with ';'
//-------------------------------------------------------------------------
protected void ParseGrammar( string GrammarContent )
{
string[] all = GrammarContent.Split( '\n' );
for( int i = 0; i < all.Length; i++ )
{
string str = all[ i ].TrimEnd( '\n', '\r' );
if(( str.Length > 0 ) && ( str[ 0 ] != ';' ))
ParseLine( str );
}
}
protected void ParseGrammar( StreamReader stream_ )
{
string str;
while(( str = stream_.ReadLine() ) != null )
{
if(( str.Length > 0 ) && ( str[ 0 ] != ';' ))
ParseLine( str );
}
}
//-------------------------------------------------------------------------
// Split rules into two parts - wordform pattern and actions with their
// corresponding descriptions. Pattern and Action are delimited by "=>"
//-------------------------------------------------------------------------
protected void ParseLine( string str_ )
{
Match match = reg.Match( str_ );
if( match.Success )
{
//-----------------------------------------------------------------
string str_Pattern = match.Groups[ 1 ].Value,
str_Variants = match.Groups[ 2 ].Value;
int i_SymIndex = str_Pattern.Length - 1;
Node CurrentNode = Root;
Debug.Assert( i_SymIndex >= 0, "Pattern string has zero length - impossible to build tree" );
//-----------------------------------------------------------------
while(( i_SymIndex >= 0 ) && ( str_Pattern[ i_SymIndex ] != '*' ))
{
Node NextNode = CurrentNode.findNodeByBranch( str_Pattern[ i_SymIndex ] );
if( NextNode == null )
{
NextNode = new Node();
CurrentNode.AddNodeBySymbol( NextNode, str_Pattern[ i_SymIndex ] );
}
i_SymIndex--;
CurrentNode = NextNode;
}
CurrentNode.InitVariants( str_Variants,
( i_SymIndex < 0 )? Node.WhatToInit.WordEndList : Node.WhatToInit.WhildCardList );
}
}
#endregion Grammar Parsing
#region High-level logic
public ArrayList GetLexemes( string wordForm, out bool isPerfectMatch )
{
int suffixLen;
string[] hypotheses = FindMatch( wordForm, out suffixLen, out isPerfectMatch );
listLexemes.Clear();
if( hypotheses != null )
{
// If we have a perfect match, that is the whole word is
// substituted with another one (or there may be two orthographic
// variants, but here we do not matter), we set a flag that we
// do not need to perform a lookup of this word in the dictionary
// later.
if( isPerfectMatch )
{
listLexemes.Add( hypotheses[ 0 ] );
}
else
{
System.Collections.IEnumerator enum_ = hypotheses.GetEnumerator();
while ( enum_.MoveNext() )
{
string str_ = TranslateModifier( wordForm, (string)enum_.Current, suffixLen );
if( ! listLexemes.Contains( str_ ))
listLexemes.Add( str_ );
}
}
}
return( listLexemes );
}
#endregion High-level logic
#region Suffix tree traversal
//-------------------------------------------------------------------------
// Walk along the suffix tree in the reversed order of the given wordform
//-------------------------------------------------------------------------
public string[] FindMatch( string wordForm,
out int matchedSuffixLength,
out bool isPerfectMatch )
{
int i_LastSym = wordForm.Length - 1;
int i_SuffixLength = 0, lastVisitedSuffixLength = 0;
Node CurrentNode = Root;
string[] ResultList = null;
//---------------------------------------------------------------------
matchedSuffixLength = 0;
isPerfectMatch = false;
while( i_LastSym >= 0 )
{
Node NextNode = CurrentNode.findNodeByBranch( wordForm[ i_LastSym ] );
//-----------------------------------------------------------------
// If there is no continuation for the word suffix - check whether
// last node (CurrentNode) has rules for suffix processing,
// otherwise return reference to rules which process lesser suffix
//-----------------------------------------------------------------
if( NextNode == null )
{
if( CurrentNode.aWordEnds != null )
{
matchedSuffixLength = lastVisitedSuffixLength;
ResultList = CurrentNode.aWordEnds;
}
else
if( CurrentNode.aWildCards != null )
{
matchedSuffixLength = i_SuffixLength;
ResultList = CurrentNode.aWildCards;
}
else
// if ResultList == null, then this assignment is just useless.
matchedSuffixLength = lastVisitedSuffixLength;
return( ResultList );
}
//-----------------------------------------------------------------
// Remember rule variants for the current suffix length so that
// return alternatives if no more narrow rule wont be found.
//-----------------------------------------------------------------
if( CurrentNode.aWildCards != null )
{
ResultList = CurrentNode.aWildCards;
lastVisitedSuffixLength = i_SuffixLength;
}
//-----------------------------------------------------------------
i_LastSym--;
i_SuffixLength++;
CurrentNode = NextNode;
}
//---------------------------------------------------------------------
// Wordform is shorter than rules which process its suffix
//---------------------------------------------------------------------
if( CurrentNode.aWordEnds != null )
{
matchedSuffixLength = i_SuffixLength;
ResultList = CurrentNode.aWordEnds;
isPerfectMatch = true;
}
else
if( ResultList != null )
{
matchedSuffixLength = lastVisitedSuffixLength;
}
return( ResultList );
}
#endregion Suffix tree traversal
//-------------------------------------------------------------------------
// 1. If Modifier == '=' then wordform is not to be changed
// 2. If Modifier == '~' suffix of specified length must be removed
// 3. Otherwise, suffix of specified length must be removed, and Modifier
// is appended to the rest.
//-------------------------------------------------------------------------
private string TranslateModifier( string wordForm, string action, int suffixLen )
{
string result;
switch( action )
{
case "=":
result = wordForm;
break;
case "~":
Debug.Assert( suffixLen <= wordForm.Length );
result = wordForm.Remove( wordForm.Length - suffixLen, suffixLen );
break;
default:
Debug.Assert( suffixLen <= wordForm.Length );
result = wordForm.Remove( wordForm.Length - suffixLen, suffixLen );
result += action;
break;
}
return( result );
}
//-------------------------------------------------------------------------
private ArrayList listLexemes;
private Node Root;
private Regex reg;
}
}