/// /// Copyright © 2003-2008 JetBrains s.r.o. /// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder). /// using System; using System.Collections; using System.Diagnostics; using System.IO; using System.Text; using JetBrains.Omea.Base; namespace JetBrains.Omea.Containers { /// /// Trie of chars. /// Each node contains sorted by char list of subnodes. /// Chars are compared via comparer passed to constructor. /// public class CharTrie : IEnumerable { /// /// Trie node /// If necessary, classes that inherit CharTrie can define specific nodes. /// To do this, inherit Node, define constructors and override NewNode() /// public class Node : IEnumerable { protected char _char; protected ArrayList _subNodes; protected static Node _compareNode = new Node( '\0', 0 ); #region IEnumerable implementation internal class SubNodesEnumerator : IEnumerator { public SubNodesEnumerator( Node root ) { _root = root; Reset(); } public void Reset() { _current = _root; _nodeStack.Clear(); _currentIndex = 0; } public object Current { get { return _current; } } public bool MoveNext() { for( ; ; ) { if( _current._subNodes != null && _currentIndex < _current._subNodes.Count ) { _nodeStack.Push( _current ); _indexStack.Push( _currentIndex ); _current = ( Node ) _current._subNodes[ _currentIndex ]; _currentIndex = 0; return true; } if( _indexStack.Count == 0 ) return false; _current = ( Node )_nodeStack.Pop(); _currentIndex = ( int ) _indexStack.Pop(); ++_currentIndex; } } private Node _root; private Node _current; private int _currentIndex; private Stack _nodeStack = new Stack(); private Stack _indexStack = new Stack(); } public IEnumerator GetEnumerator() { return new SubNodesEnumerator( this ); } #endregion /** * generic nodes' comparer is tuned by char comparer */ public class Comparer : IComparer { private readonly IComparer _charComparer; public Comparer( IComparer charComparer ) { _charComparer = charComparer; } public int Compare( object x, object y ) { Node nodex = ( Node ) x; Node nodey = ( Node ) y; return ( _charComparer == null ) ? nodex._char - nodey._char : _charComparer.Compare( nodex._char, nodey._char ); } } /** * Default nodes' comparer */ public class DefaultComparer : IComparer { private static readonly IComparer _instance = new DefaultComparer(); private DefaultComparer() {} public int Compare( object x, object y ) { Node nodex = ( Node ) x; Node nodey = ( Node ) y; return nodex._char - nodey._char; } public static IComparer Instance { get { return _instance; } } } public Node( char aChar ) { _char = aChar; } public Node( char aChar, int subNodesCapacity ) { _char = aChar; if( subNodesCapacity > 0 ) _subNodes = new ArrayList( subNodesCapacity ); } public virtual Node NewNode( char aChar, Node parent, object context ) { return new Node( aChar ); } public Node SubNode( char aChar, IComparer nodeComparer ) { if( _subNodes == null ) return null; _compareNode._char = aChar; int index = _subNodes.BinarySearch( _compareNode, nodeComparer ); return ( index < 0 ) ? null : (( Node ) _subNodes[ index ] ); } public Node Insert( char aChar, IComparer nodeComparer ) { if( _subNodes == null ) { _subNodes = new ArrayList( 1 ); Node newNode = NewNode( aChar, this, null ); _subNodes.Add( newNode ); return newNode; } else { _compareNode._char = aChar; int index = _subNodes.BinarySearch( _compareNode, nodeComparer ); if( index < 0 ) { index = ~index; _subNodes.Insert( index, NewNode( aChar, this, null ) ); } return ( Node ) _subNodes[ index ]; } } /// /// Provides access to the char value of this node. Used primarily for comparison needs. /// public char Value { get { return _char; } } #region saving/loading to/from a binary stream internal virtual void Save( BinaryWriter stream ) { stream.Write( _char ); int subNodes = ( _subNodes == null ) ? 0 : _subNodes.Count; stream.Write( (ushort)subNodes ); if( subNodes > 0 ) { foreach( Node node in _subNodes ) node.Save( stream ); } } internal virtual void Load( BinaryReader stream ) { _char = stream.ReadChar(); int subNodes = ( int )stream.ReadUInt16(); if( subNodes > 0 ) { _subNodes = new ArrayList( subNodes ); for (int i = 0 ; i < subNodes; ++i ) { Node newNode = NewNode( '\0', this, null ); newNode.Load( stream ); _subNodes.Add( newNode ); } } } #endregion } public CharTrie( IComparer charComparer ) { _nodeComparer = (charComparer != null) ? new Node.Comparer( charComparer ) : Node.DefaultComparer.Instance; Clear(); } public void Clear() { _root = new Node( '\0', 128 ); } public Node Root { get { return _root; } } public Node Add( string str ) { Node currNode = _root; for( int i = 0; i < str.Length; ++i ) currNode = currNode.Insert( str[ i ], _nodeComparer ); return currNode; } public Node Add( string str, int index, int count ) { Node currNode = _root; for( int i = index; count > 0 && i < str.Length; ++i, --count ) currNode = currNode.Insert( str[ i ], _nodeComparer ); return currNode; } /** * returns length of the largest string in the trie that matches the given */ public int GetMatchingLength( string str ) { Node currNode = _root; int i = 0; for( ; i < str.Length; ++i ) if( ( currNode = currNode.SubNode( str[ i ], _nodeComparer )) == null ) break; return i; } public int GetMatchingLength( string str, int index, int count ) { Node currNode = _root; int i = index; for( ; count > 0 && i < str.Length; ++i, --count ) if( ( currNode = currNode.SubNode( str[ i ], _nodeComparer )) == null ) break; return i - index; } /** * returns node of the largest string in the trie that matches the given */ public Node GetMatchingNode( string str ) { return GetMatchingNode( str, 0, str.Length ); } public Node GetMatchingNode( string str, int index, int count ) { Node currNode = _root; int i = index; for( ; count > 0 && i < str.Length; ++i, --count ) { Node nextNode = currNode.SubNode( str[ i ], _nodeComparer ); if( nextNode == null ) break; currNode = nextNode; } return currNode; } #region IEnumerable implementation public IEnumerator GetEnumerator() { return new Node.SubNodesEnumerator( _root ); } #endregion #region saving/loading to/from a binary stream public void Save( BinaryWriter stream ) { _root.Save( stream ); } public void Load( BinaryReader stream ) { Clear(); _root.Load( stream ); } #endregion protected internal IComparer _nodeComparer; protected internal Node _root; } /** * Trie of bytes located in external memory */ public class ExternalTrie: IDisposable { public const int _recordSize = 13; public const int _nodesCacheSize = 4095; public const int _stringCachesize = 2047; public ExternalTrie( string fileName ) { Init( fileName ); } public ExternalTrie( string fileName, ICachingStrategy strategy ) { _cachingStrategy = strategy; Init( fileName ); } public string FileName { get { return _fileName; } } public bool IsTrieComplete { get { return _isTrieComplete; } } public bool IsDirty { get { return _isDirty; } } public int FirstFreeIndex { get { return (int) ( _storage.Length / _recordSize ); } } public int NodesCacheSize { get { return _nodesCache.Size; } set { _nodesCache = new IntObjectCache( value ); } } public void Flush() { if( _isDirty ) { _storage.Flush(); _isDirty = false; } } /** * returns true if the string was actually added * otherwise the string is already located the external trie */ public bool AddString( string str, out int resultIndex ) { resultIndex = -1; if( str.Length == 0 ) { return false; } object key = _stringCache.TryKey( str ); if( key != null ) { resultIndex = (int) key; return true; } byte[] strBytes = Encoding.UTF8.GetBytes( str ); int lastIndex = strBytes[ 0 ]; TrieNode lastNode = _firstByteNodes[ lastIndex ]; bool result = false; for( int i = 1; i < strBytes.Length; ++i ) { byte currentByte = strBytes[ i ]; int index = lastNode._firstChild; TrieNode currentNode; for( ; index != 0; ) { currentNode = LoadNodeByIndex( index ); if( currentNode == null ) { index = 0; break; } if( currentNode._byte == currentByte ) { lastNode = currentNode; lastIndex = index; break; } index = currentNode._sibling; } if( index == 0 ) { TrieNode newNode = AllocNode(); newNode._byte = currentByte; newNode._firstChild = 0; newNode._sibling = lastNode._firstChild; newNode.Parent = lastIndex; newNode.IsToken = (i == strBytes.Length - 1); int newIndex = SaveNode2End( newNode ); lastNode._firstChild = newIndex; SaveNodeByIndex( lastNode, lastIndex ); lastNode = newNode; lastIndex = newIndex; result = true; } } resultIndex = lastIndex; _stringCache.CacheObject( str, lastIndex ); return result; } /** * gets index of a string in the external trie * if index is more than or equal to zero, the string is located in the trie */ public int GetStringIndex( string str ) { if( str.Length == 0 ) { return -1; } object key = _stringCache.TryKey( str ); if( key != null ) { return (int) key; } byte[] strBytes = Encoding.UTF8.GetBytes( str ); int lastIndex = strBytes[ 0 ]; TrieNode lastNode = _firstByteNodes[ lastIndex ]; for( int i = 1; i < strBytes.Length; ++i ) { byte currentByte = strBytes[ i ]; int index = lastNode._firstChild; TrieNode currentNode; for( ; ; ) { if( index == 0 ) { return -1; } currentNode = LoadNodeByIndex( index ); if( currentNode == null ) { return -1; } if( currentNode._byte == currentByte ) { lastNode = currentNode; lastIndex = index; break; } index = currentNode._sibling; } } _stringCache.CacheObject( str, lastIndex ); return lastIndex; } public string GetStringByIndex( int index ) { ArrayList bytes = new ArrayList(); TrieNode node; while( index != 0 ) { node = LoadNodeByIndex( index ); if( node == null ) { return null; } bytes.Insert( 0, node._byte ); index = node.Parent; } return ( bytes.Count > 0 ) ? Encoding.UTF8.GetString( (byte[]) bytes.ToArray( typeof( byte ) ) ) : null; } public ArrayList GetMatchingStrings( string wildcard, bool tokensOnly ) { ArrayList result = new ArrayList(); IntArrayList indexes = new IntArrayList(); if( wildcard.EndsWith( "*" ) ) { wildcard = wildcard.TrimEnd( '*' ); GetSubtree( GetStringIndex( wildcard ), indexes, tokensOnly ); for( int i = 0; i < indexes.Count; ++i ) { string str = GetStringByIndex( indexes[ i ] ); if( str != null ) { result.Add( str ); } } } else if( wildcard.EndsWith( "?" ) ) { int level = 1; int wcLength = wildcard.Length; while( ( wildcard = wildcard.Remove( wcLength - level, 1 ) ).EndsWith( "?" ) ) { ++level; } GetSubtree( GetStringIndex( wildcard ), indexes, tokensOnly ); for( int i = 0; i < indexes.Count; ++i ) { string token = GetStringByIndex( indexes[ i ] ); if( token != null && token.Length == wcLength ) { result.Add( token ); } } } else { result.Add( wildcard ); } return result; } private void GetSubtree( int index, IntArrayList wards, bool tokensOnly ) { if( index >= 0 ) { TrieNode node = LoadNodeByIndex( index ); if( !tokensOnly || node.IsToken ) { wards.Add( index ); } index = node._firstChild; while( index != 0 ) { GetSubtree( index, wards, tokensOnly ); node = LoadNodeByIndex( index ); index = node._sibling; } } } #region IDisposable Members public void Dispose() { Flush(); _storage.Close(); Trace.WriteLine( "ExternalTrie(" + _fileName + "): nodes cache hit rate = " + (int) ( _nodesCache.HitRate() * 100 ) + '%' ); Trace.WriteLine( "ExternalTrie(" + _fileName + "): strings cache hit rate = " + (int) ( _stringCache.HitRate() * 100 ) + '%' ); } #endregion #region implementation details private class TrieNode { public byte _byte; public int _firstChild; public int _sibling; public int _parent; public bool IsToken { get { return ( _parent & 1 ) == 1; } set { if( value ) { _parent |= 1; } else { _parent = ( _parent >> 1 ) << 1; } } } public int Parent { get { return _parent >> 1; } set { _parent = value << 1; } } } private void Init( string fileName ) { _fileName = fileName; FileStream stream = new FileStream( fileName, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read, 256 ); if( _cachingStrategy != null ) { _storage = new CachedStream( stream, _cachingStrategy ); } else { _storage = new CachedStream( stream, 1 << 18 ); } _writer = new BinaryWriter( _storage ); _reader = new BinaryReader( _storage ); _firstByteNodes = new TrieNode[ 256 ]; for( int i = 0; i < 256; ++i ) { TrieNode node = new TrieNode(); _isTrieComplete = LoadNode( node ); if( !_isTrieComplete ) { node._byte = (byte) i; node.Parent = 0; node.IsToken = false; SaveNode( node ); Flush(); } _firstByteNodes[ i ] = node; } _stringCache = new ObjectCache( _stringCachesize ); _nodesCache = new IntObjectCache( _nodesCacheSize ); _nodesCache.ObjectRemoved += _nodesCache_ObjectRemoved; _isDirty = false; } /** * Loads node from current location */ private bool LoadNode( TrieNode node ) { try { node._byte = (byte)_storage.ReadByte(); node._firstChild = _reader.ReadInt32(); node._sibling = _reader.ReadInt32(); node._parent = _reader.ReadInt32(); } catch( EndOfStreamException ) { return false; } return true; } /** * Loads node by specified index */ private TrieNode LoadNodeByIndex( int index ) { TrieNode node = (TrieNode) _nodesCache.TryKey( index ); if( node == null ) { if( Seek2NodeByIndex( index ) ) { node = AllocNode(); LoadNode( node ); _nodesCache.CacheObject( index, node ); } } return node; } /** * Saves specified node to current location */ private void SaveNode( TrieNode node ) { _storage.WriteByte( node._byte ); _writer.Write( node._firstChild ); _writer.Write( node._sibling ); _writer.Write( node._parent ); _isDirty = true; } /** * Seeks to the specified index location */ private bool Seek2NodeByIndex( int index ) { long position = ( (long) index ) * _recordSize; if( position >= 0 && _storage.Length >= position ) { _storage.Position = position; return true; } return false; } /** * Saves node by its index */ private void SaveNodeByIndex( TrieNode node, int index ) { _nodesCache.Remove( index ); Seek2NodeByIndex( index ); SaveNode( node ); } /** * saves new node to end and returns its index */ private int SaveNode2End( TrieNode node ) { _storage.Seek( 0, SeekOrigin.End ); SaveNode( node ); return FirstFreeIndex - 1; } private TrieNode AllocNode() { if( _freeNode == null ) { return new TrieNode(); } TrieNode result = _freeNode; _freeNode = null; return result; } private void _nodesCache_ObjectRemoved( object sender, IntObjectCacheEventArgs e ) { _freeNode = (TrieNode) e.Object; } #endregion private string _fileName; private Stream _storage; private BinaryWriter _writer; private BinaryReader _reader; private TrieNode[] _firstByteNodes; private ObjectCache _stringCache; private IntObjectCache _nodesCache; private TrieNode _freeNode; private bool _isTrieComplete; private bool _isDirty; private ICachingStrategy _cachingStrategy; } }