///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using JetBrains.Omea.OpenAPI;
namespace JetBrains.Omea.TextIndex
{
#region TreeStructureDescription
public class QueryParserNode
{
public enum Type { eoTerm, eoAnd, eoOr, eoNear, eoPhraseNear, eoNot, eoSection }
public QueryParserNode( Type eo_ ) { eoType = eo_; }
public Type NodeType { get{ return( eoType ); } }
protected Type eoType;
}
public class OpNode : QueryParserNode
{
public OpNode( Type eo_ ) : base( eo_ ) {}
public int BranchesNumber { get { return( aOperands.Count ); } }
public ArrayList Branches() { return aOperands; }
public QueryParserNode this[ int i_ ]
{
get
{
Debug.Assert( i_ < aOperands.Count );
return( (QueryParserNode)aOperands[ i_ ] );
}
}
public void AddOperand( QueryParserNode node )
{ aOperands.Add( node ); }
public void SetOperands( ArrayList operands )
{
aOperands = new ArrayList();
aOperands.AddRange( operands );
}
public void RemoveOperand( QueryParserNode node )
{
aOperands.Remove( node );
}
public EntryProximity RequiredProximity
{
get
{
if( eoType == Type.eoAnd )
return EntryProximity.Document;
if( eoType == Type.eoNear )
return EntryProximity.Sentence;
if( eoType == Type.eoPhraseNear )
return EntryProximity.Phrase;
throw new NotSupportedException( "OpNode -- Node of type [" + eoType + "] does not support proximity estimation." );
}
}
#region Attributes
protected ArrayList aOperands = new ArrayList();
#endregion Attributes
}
public class TermNode : QueryParserNode
{
public TermNode() : base( Type.eoTerm ) {}
public TermNode( string str ) : base( Type.eoTerm )
{
Term = str;
}
public string Term
{
set{ strValue = value; }
get{ return strValue; }
}
protected string strValue;
}
public class SectionNode : OpNode
{
public SectionNode() : base( Type.eoSection ) {}
public string SectionName { get; set; }
}
#endregion
//-------------------------------------------------------------------------
// Parse input query, build query tree, merge words corresponding to terms,
// optimize number of levels.
//-------------------------------------------------------------------------
public static class QueryParser
{
public static QueryPostfixForm ParseQuery( string query )
{
Initialize();
_query = query.Trim().ToLower();
QueryPostfixForm form;
try
{
QueryParserNode nodeRoot = ParseExpression();
QueryParserNode newRoot = null;
PropagateSectionOpInside( ref newRoot, nodeRoot );
if( newRoot == null )
newRoot = nodeRoot;
form = new QueryPostfixForm();
Tree2Postfix( newRoot, form );
}
catch( Exception )
{
form = null;
}
return form;
}
public static string Error { get{ return( _errorMessage ); } }
private static void Initialize()
{
_errorMessage = null;
iCurrentOffset = 0;
strToken = strPrevToken = "";
isPhrasalMode = false;
}
#region Parser
private static QueryParserNode ParseExpression()
{
QueryParserNode nodeLeftOperand = ProcessTerm();
GetNextToken();
//---------------------------------------------------------------------
// if next token == "" => EOS, no more processing, return left operand
// as result,
// if next token == ")" => we reached the endo of parenthed expression,
// return left operand as result, else process two operands and Op,
// return Op as root.
//---------------------------------------------------------------------
if(( strToken == "" ) || ( strToken == ")" ) || ( strToken == "+\"" ))
{
BackToken();
return( nodeLeftOperand );
}
QueryParserNode.Type typeNode;
if( !isPhrasalMode )
{
switch( strToken )
{
case "and": typeNode = QueryParserNode.Type.eoAnd; break;
case "or": typeNode = QueryParserNode.Type.eoOr; break;
case "near": typeNode = QueryParserNode.Type.eoNear; break;
default: typeNode = QueryParserNode.Type.eoAnd; BackToken(); break;
}
}
else
{
typeNode = QueryParserNode.Type.eoPhraseNear;
BackToken();
}
OpNode nodeOp = new OpNode( typeNode );
QueryParserNode nodeRightOperand = ParseExpression();
nodeOp.AddOperand( nodeLeftOperand );
nodeOp.AddOperand( nodeRightOperand );
return( nodeOp );
}
private static QueryParserNode ProcessTerm()
{
QueryParserNode nodeTerm;
QueryParserNode result = nodeTerm = ProcessPrimaryLevel();
GetNextToken();
if( strToken == "[" )
{
SectionNode nodeOp = new SectionNode();
GetNextToken();
nodeOp.SectionName = strToken;
nodeOp.AddOperand( nodeTerm );
_errorMessage = "Error in the query syntax - expected ']' symbol in document section specifier";
Expect( "]" );
_errorMessage = null;
result = nodeOp;
}
else
BackToken();
return( result );
}
private static QueryParserNode ProcessPrimaryLevel()
{
QueryParserNode nodePrimaryToken = null;
GetNextToken();
if( strToken != "" )
{
if( strToken == "(" )
{
nodePrimaryToken = ParseExpression();
_errorMessage = "Error in the query syntax - expected ')' symbol in expression";
Expect( ")" );
_errorMessage = null;
}
else
if( strToken == "\"+" )
{
isPhrasalMode = true;
nodePrimaryToken = ParseExpression();
Expect( "+\"" );
isPhrasalMode = false;
}
else
{
string normToken = OMEnv.LexemeConstructor.GetNormalizedToken( strToken );
if( isDelimitableToken( normToken ))
nodePrimaryToken = SplitTokenToTree( normToken );
else
nodePrimaryToken = new TermNode( normToken );
}
}
return( nodePrimaryToken );
}
#endregion Parser
#region TreeConverters
private static void PropagateSectionOpInside( ref QueryParserNode parent, QueryParserNode subtreeRoot )
{
if( subtreeRoot.NodeType == QueryParserNode.Type.eoTerm )
return;
if( subtreeRoot.NodeType == QueryParserNode.Type.eoSection )
{
Debug.Assert( ((OpNode)subtreeRoot).BranchesNumber == 1, "Illegal parsing of section operator" );
QueryParserNode child = ((OpNode)subtreeRoot)[ 0 ];
ArrayList branches;
// if section op is applied to the tree - perform conversion -
// propagate section op into every operand of the tree, and
// tree root becomes new local root.
if( child.NodeType != QueryParserNode.Type.eoTerm &&
child.NodeType != QueryParserNode.Type.eoSection )
{
//-------------------------------------------------------------
if( parent == null )
parent = child;
else
{
((OpNode)parent).RemoveOperand( subtreeRoot );
((OpNode)parent).AddOperand( child );
}
//-------------------------------------------------------------
branches = ((OpNode)child).Branches();
for( int i = 0; i < branches.Count; i++ )
{
SectionNode newSectionOp = new SectionNode();
newSectionOp.SectionName = ((SectionNode)subtreeRoot).SectionName;
newSectionOp.AddOperand( (QueryParserNode)branches[ i ] );
branches[ i ] = newSectionOp;
}
((OpNode)child).SetOperands( branches );
}
//-----------------------------------------------------------------
if( child.NodeType != QueryParserNode.Type.eoTerm )
{
branches = ((OpNode)child).Branches();
for( int i = 0; i < branches.Count; i++ )
PropagateSectionOpInside( ref child, (QueryParserNode)branches[ i ] );
}
}
}
private static void Tree2Postfix( QueryParserNode root, QueryPostfixForm form )
{
if( root == null )
return;
if( root.NodeType != QueryParserNode.Type.eoTerm )
{
foreach( QueryParserNode node in ((OpNode)root).Branches() )
Tree2Postfix( node, form );
}
form.Add( root );
}
private static QueryParserNode SplitTokenToTree( string token )
{
ArrayList subtokens = SplitTokens( token );
QueryParserNode node;
// 1. If there are no tokens - the input string completely consists
// of delimiter symbols.
// 2. If there is only one token - then delimiter symbols either started
// or closed the string.
if( subtokens.Count == 0 )
{
node = new TermNode( token );
}
else if( subtokens.Count == 1 )
{
// check if wildcard
if( token.EndsWith( "*" ) || token.EndsWith( "?" ) )
{
ArrayList tokens = Word.GetTokensByWildcard( token );
if( tokens.Count > 0 )
{
node = BuildTree( tokens, QueryParserNode.Type.eoOr );
}
else
{
node = new TermNode( token );
}
}
else
{
node = new TermNode( (string)subtokens[ 0 ] );
}
}
else
{
node = BuildTree( subtokens, QueryParserNode.Type.eoPhraseNear );
}
return node;
}
private static QueryParserNode BuildTree( IList subtokens, QueryParserNode.Type opCode )
{
QueryParserNode node = new TermNode( (string)subtokens[ subtokens.Count - 1 ] );
for( int i = subtokens.Count - 2; i >= 0; i-- )
{
TermNode leftOpnd = new TermNode( (string)subtokens[ i ] );
OpNode op = new OpNode( opCode );
op.AddOperand( leftOpnd );
op.AddOperand( node );
node = op;
}
return node;
}
private static ArrayList SplitTokens( string token )
{
int delimIndex;
ArrayList subtokens = new ArrayList();
do
{
delimIndex = GetDelimiterIndex( token );
if( delimIndex != -1 )
{
// Forbid using parentheses '(', ')', '[' and ']' as
// used non-intentionally.
char ch = token[ delimIndex ];
if(( delimIndex == 0 && isCloseBrace( ch )) ||
( delimIndex != 0 ) && ( isOpenBrace( ch ) || isCloseBrace( ch ) ))
{
_errorMessage = "Illegal characters met in the token \"" + token + "\"";
throw new Exception( "Illegal characters met in the token \"" + token + "\"" );
}
else
if( delimIndex != 0 )
{
subtokens.Add( token.Substring( 0, delimIndex ) );
}
token = token.Substring( delimIndex + 1 );
}
else
if( token.Length > 0 )
subtokens.Add( token );
}
while( delimIndex != -1 );
return subtokens;
}
#endregion TreeConverters
#region Tokenizer
private static void GetNextToken()
{
if( strPrevToken != "" )
{
strToken = strPrevToken;
strPrevToken = "";
}
else
{
if( iCurrentOffset == _query.Length )
{
strToken = "";
}
else
{
SkipWhitespace();
if( isParenthesisSymbol() )
{
strToken = _query.Substring( iCurrentOffset++, 1 );
//---------------------------------------------------------
// We have somehow to distinguish between opening and closing
// quote sign. Blank before or after it gives a tip.
//---------------------------------------------------------
if( strToken == "\"" )
{
if(( iCurrentOffset == _query.Length ) ||
( _query[ iCurrentOffset ] == ' ' ) || ( _query[ iCurrentOffset ] == ')' ))
strToken = "+\"";
else
strToken = "\"+";
}
}
else
{
int i_StartOffset = iCurrentOffset;
while(( iCurrentOffset < _query.Length ) &&
!isParenthesisSymbol() && _query[ iCurrentOffset ] != ' ' )
iCurrentOffset++;
if( iCurrentOffset == i_StartOffset ) // End Of String
strToken = "";
else
strToken = _query.Substring( i_StartOffset, iCurrentOffset - i_StartOffset );
}
}
}
return;
}
private static void BackToken()
{
Debug.Assert( strPrevToken == "", "Attempt to overvrite non-empty BackToken" );
strPrevToken = strToken;
}
private static void SkipWhitespace()
{
while(( iCurrentOffset < _query.Length ) && ( _query[ iCurrentOffset ] == ' ' ))
iCurrentOffset++;
}
private static void Expect( string str_ )
{
GetNextToken();
if( strToken != str_ )
{
throw( new Exception( "Illegal query format - expected token: '" + str_ + "'" ) );
}
}
private static bool isParenthesisSymbol()
{
Debug.Assert( iCurrentOffset < _query.Length );
bool isStart = iCurrentOffset == 0,
isFinish = iCurrentOffset == _query.Length - 1;
char ch_ = _query[ iCurrentOffset ];
char prevCh = Char.MinValue, nextCh = Char.MinValue;
if( iCurrentOffset > 0 )
prevCh = _query[ iCurrentOffset - 1 ];
if( iCurrentOffset < _query.Length - 1 )
nextCh = _query[ iCurrentOffset + 1 ];
return(( isOpenBrace( ch_ ) && ( isStart || ( prevCh == ' ' ) )) ||
( isCloseBrace( ch_ ) && ( isFinish || (nextCh == ' ' ) || (nextCh == ')' ) )) ||
( ch_ == '"' ) && ( isStart || ( prevCh == ' ' ) || ( prevCh == '(' ) ||
isFinish || ( nextCh == ' ' ) || ( nextCh == ')' ))) ;
}
private static bool isOpenBrace( char ch )
{
return ( ch == '[' ) || ( ch == '(' );
}
private static bool isCloseBrace( char ch )
{
return ( ch == ']' ) || ( ch == ')' ) || ( ch == '"' );
}
private static bool isDelimitableToken( string token )
{
return( GetDelimiterIndex( token ) != -1 );
}
private static int GetDelimiterIndex( string token )
{
for( int i = 0; i < token.Length; i++ )
{
if( TextDocParser.isDelimiter( token[ i ] ))
return i;
}
return -1;
}
#endregion
#region Attributes
private static int iCurrentOffset = 0;
private static string _query;
private static string strToken = "", strPrevToken = "";
private static bool isPhrasalMode = false;
private static string _errorMessage;
#endregion Attributes
}
public class QueryPostfixForm : List
{
private int _termNodesNumber;
public int TermNodesCount { get { return( _termNodesNumber ); } }
public void IncTermNodesCount() { _termNodesNumber++; }
public int PostfixCount { get { return( Count ); } }
public QueryPostfixForm()
{
_termNodesNumber = 0;
}
public new void Add( QueryParserNode node )
{
if( node is TermNode )
IncTermNodesCount();
base.Add( node );
}
}
}