///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.Collections;
using System.Diagnostics;
using System.IO;
using System.Reflection;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Xml;
using JetBrains.DataStructures;
using JetBrains.Omea.Base;
using JetBrains.Omea.Containers;
using JetBrains.Omea.TextIndex;
namespace JetBrains.Omea.HTML
{
/// <summary>
/// For a given stream, the HTML parser returns a sequence of text fragments.
/// </summary>
/// <remarks>
/// A fragment is returned by <see cref="ReadNextFragment()"/> only if it is situated
/// in the body or in the title, and not in a script or any other place outside the above mentioned ones.
///
/// The <see cref="InBody"/> and <see cref="InTitle"/> properties allow checking which kind of fragment is currently being processed.
///
/// Each read fragment can be either a simple fragment or a heading;
/// this can be verified via the <see cref="InHeading"/> property.
/// </remarks>
public class HTMLParser : IDisposable
{
/// <summary>
/// Callback invoked by the parser for a recognized HTML tag. It determines the behavior in response
/// to individual HTML tags, fetches the attributes for indexing, and so on.
/// </summary>
/// <param name="instance">The parser that encountered the tag.</param>
/// <param name="tag">The tag text as produced by <c>ReadTag</c> (everything between the angle brackets).</param>
public delegate void TagHandler( HTMLParser instance, string tag );
// When true (the default), Dispose() also closes the underlying reader.
private bool _closeReader = true;
/// <summary>
/// Orders characters by their lower-cased code, so that tag names match regardless of letter case.
/// </summary>
/// <remarks>
/// Lower-casing uses the invariant culture. The culture-sensitive <c>Char.ToLower</c> maps 'I' to a
/// dotless 'ı' under Turkish/Azeri cultures, which would make tag matching depend on the thread culture.
/// </remarks>
internal class CaseInsensitiveCharComparer: IComparer
{
    #region IComparer Members
    /// <summary>
    /// Compares two boxed <see cref="char"/> values ignoring case.
    /// </summary>
    /// <param name="x">First character (boxed <c>char</c>).</param>
    /// <param name="y">Second character (boxed <c>char</c>).</param>
    /// <returns>Zero when the characters are equal ignoring case; otherwise the signed difference of their lower-cased codes.</returns>
    public int Compare( object x, object y )
    {
        return Char.ToLowerInvariant( ( char ) x ) - Char.ToLowerInvariant( ( char ) y );
    }
    #endregion
}
/// <summary>
/// Builds the shared (static) tag registry: a case-insensitive trie of tag names and a map
/// from trie nodes to the built-in handlers that track parser state.
/// </summary>
static HTMLParser()
{
    _tagsTrie = new CharTrie( new CaseInsensitiveCharComparer() );
    _tagsHandlers = new HashMap();

    // Document-structure tags, registered in the same order as always.
    RegisterBuiltInHandler( "meta", new TagHandler( HandleMeta ) );
    RegisterBuiltInHandler( "title", new TagHandler( OpeningTitle ) );
    RegisterBuiltInHandler( "/title", new TagHandler( ClosingTitle ) );
    RegisterBuiltInHandler( "body", new TagHandler( OpeningBody ) );
    RegisterBuiltInHandler( "/body", new TagHandler( ClosingBody ) );
    RegisterBuiltInHandler( "script", new TagHandler( OpeningScript ) );
    RegisterBuiltInHandler( "/script", new TagHandler( ClosingScript ) );

    // Heading levels: all opening tags first, then all closing tags.
    string[] headingTags = new string[] { "h1", "h2", "h3", "h4", "h5", "h6" };
    foreach( string heading in headingTags )
    {
        RegisterBuiltInHandler( heading, new TagHandler( OpeningHeading ) );
    }
    foreach( string heading in headingTags )
    {
        RegisterBuiltInHandler( "/" + heading, new TagHandler( ClosingHeading ) );
    }
}

/// <summary>
/// Adds a tag name to the shared trie and binds the resulting node to its handler.
/// </summary>
private static void RegisterBuiltInHandler( string tag, TagHandler handler )
{
    _tagsHandlers.Add( _tagsTrie.Add( tag ), handler );
}
/// <summary>
/// Creates an HTML parser over a <see cref="TextReader"/>.
/// </summary>
/// <param name="reader">Provides the HTML content to be converted to text.</param>
public HTMLParser( TextReader reader )
{
    // Wrap the source first; an empty stream is reported as finished right away.
    _reader = new HtmlEntityReader( reader );
    _finished = ( _reader.Peek() == -1 );

    // Scratch buffers come from the shared pool and are handed back in Dispose().
    _tagBuilder = StringBuilderPool.Alloc();
    _fragmentBuilder = StringBuilderPool.Alloc();

    // Nothing has been detected yet.
    _charset = string.Empty;
    _title = string.Empty;

    // The parser starts outside of every tracked element.
    _inTitle = false;
    _inScript = false;
    _inHeading = false;
    _inBody = false;

    // Per-instance handlers are created lazily by AddTagHandler().
    _localTagsTrie = null;
    _localTagsHandlers = null;
}
/// <summary>
/// Creates an HTML parser over a <see cref="TextReader"/>, optionally treating the whole input as body content.
/// </summary>
/// <param name="reader">Provides the HTML content to be converted to text.</param>
/// <param name="parseAll">
/// If <c>true</c>, the parser starts in the "in body" state, so text is returned from all of the input, not only from the body.
/// </param>
public HTMLParser( TextReader reader, bool parseAll )
    : this( reader )
{
    // The base constructor leaves the flag cleared; pre-set it when everything should be parsed.
    this._inBody = parseAll;
}
/// <summary>
/// Gets or sets whether <see cref="Dispose"/> also closes the underlying reader. Defaults to <c>true</c>.
/// </summary>
public bool CloseReader
{
    get
    {
        return this._closeReader;
    }
    set
    {
        this._closeReader = value;
    }
}
#region IDisposable Members
// Set once Dispose() has run, so that repeated calls do not hand the same builders back to the pool twice.
private bool _disposed;

/// <summary>
/// Closes the underlying reader (when <see cref="CloseReader"/> is <c>true</c>) and returns the
/// pooled string builders. Calling it more than once has no further effect.
/// </summary>
public void Dispose()
{
    if( _disposed )
    {
        return;
    }
    _disposed = true;
    try
    {
        if( _closeReader )
        {
            _reader.Close();
        }
    }
    finally
    {
        // Hand the builders back even if closing the reader failed.
        StringBuilderPool.Dispose( _tagBuilder );
        StringBuilderPool.Dispose( _fragmentBuilder );
    }
}
#endregion
/// <summary>
/// Registers a handler, for this parser instance only, that is invoked whenever the given tag is read.
/// </summary>
/// <param name="tag">Tag name to react to; a closing tag is written with a leading slash, e.g. <c>/title</c>.</param>
/// <param name="handler">Callback to invoke. It runs after the built-in handler for the same tag, if there is one.</param>
public void AddTagHandler( string tag, TagHandler handler )
{
    EnsureLocalRegistry();
    _localTagsHandlers.Add( _localTagsTrie.Add( tag ), handler );
}

/// <summary>
/// Creates the per-instance trie and handler map on first use.
/// </summary>
private void EnsureLocalRegistry()
{
    if( _localTagsTrie != null )
    {
        return;
    }
    _localTagsTrie = new CharTrie( new CaseInsensitiveCharComparer() );
    _localTagsHandlers = new HashMap();
}
/// <summary>
/// Reads the next text fragment from the HTML stream, discarding its start position.
/// </summary>
/// <returns>Text fragment without any HTML formatting and with the entities substituted.</returns>
public string ReadNextFragment()
{
    int ignoredStart;
    string fragment = ReadNextFragment( out ignoredStart );
    return fragment;
}
/// <summary>
/// Reads the next text fragment from the HTML stream and reports where its HTML representation starts.
/// </summary>
/// <param name="start">
/// Receives the reader position at which the returned fragment begins in the HTML stream,
/// or -1 if the end of the stream was hit before the start of any text was located.
/// </param>
/// <returns>
/// Text fragment without any HTML formatting and with the entities substituted.
/// The result can be an empty string when the stream ends before any text is collected.
/// </returns>
/// <exception cref="EndOfStreamException">
/// The parser is already finished. Check the <see cref="Finished"/> property before calling this method.
/// </exception>
/// <remarks>
/// With word breaking enabled (<c>_doBreakWords</c>), a fragment is one word plus the delimiter
/// characters that follow it; otherwise a fragment is the whole text up to the next tag.
/// </remarks>
public string ReadNextFragment(out int start)
{
if(_finished)
throw new EndOfStreamException( "Cannot read beyond the end of HTML stream. Please mind the Finished property." );
// The pooled builder is reused between calls, so clear whatever the previous call left in it
_fragmentBuilder.Length = 0;
do // This loop avoids returning empty fragments
{
start = -1; // In case of failure, return -1
char lastReadChar;
try
{
// Read through any tags preceding the text node.
// Characters are also consumed (and dropped) while outside both body and title, or inside a script.
string tag;
while( ((lastReadChar = _reader.PeekChar( false )) == '<') || (!_inBody && !_inTitle) || _inScript )
{
lastReadChar = _reader.ReadChar( false );
if( lastReadChar == '<' )
{
tag = ReadTag();
// Built-in handler first (shared static registry, looked up under a lock on the map)...
object handler;
lock( _tagsHandlers )
{
handler = _tagsHandlers[ _tagsTrie.GetMatchingNode( tag ) ];
}
if( handler != null )
{
((TagHandler) handler)( this, tag );
}
// ...then the per-instance handler added through AddTagHandler, if any
if( _localTagsTrie != null )
{
handler = _localTagsHandlers[ _localTagsTrie.GetMatchingNode( tag ) ];
if( handler != null )
{
((TagHandler) handler)( this, tag );
}
}
}
}
// We're in between the tags and the text node. Remember this position
start = _reader.Position;
if(_doBreakWords) // As the word ends, stop and return it (along with all the characters following the word)
{
// Collect the next token from the text node, up to the first spacing char
while( _reader.PeekChar( false ) != '<' ) // Do not subst the entities for bracket here
{
if(TextDelimitingCategories.IsDelimiter( _reader.PeekChar(true) )) // Peek with substitution
break;
_fragmentBuilder.Append( _reader.ReadChar( true ) ); // Read with substitution and append to the output
}
// Collect all the spacing chars following the token, up to the next token or an html tag
while( _reader.PeekChar( false ) != '<' ) // Do not subst the entities for bracket here
{
if(!TextDelimitingCategories.IsDelimiter( _reader.PeekChar(true) )) // Peek with substitution
break;
_fragmentBuilder.Append( _reader.ReadChar( true ) ); // Read with substitution and append to the output
}
}
else // Do not break the words, return the whole fragment (up to the next tag)
{
// Collect the next token from the text node, up to the opening angle bracket of the next tag
while( _reader.PeekChar( false ) != '<' ) // Do not subst the entities for bracket here
_fragmentBuilder.Append( _reader.ReadChar( true ) ); // Read with substitution and append to the output
}
}
catch( EndOfStreamException )
{
// Presumably raised by the reader when the input runs out; whatever was collected so far is still returned
_finished = true;
}
}while((!_finished) && (_fragmentBuilder.Length == 0)); // Keep trying until we collect some text. Do not return empty strings
string result = _fragmentBuilder.ToString();
// Store the title in the property.
// NOTE(review): this overwrites rather than appends, so with word breaking enabled the Title
// property ends up holding only the most recently read fragment of the title — confirm this is intended.
if( _inTitle )
_title = result;
return result;
}
/// <summary>
/// Extracts the quoted attributes from the text of a tag, e.g. <c>tag attr="value1" attr2='value2'</c>.
/// </summary>
/// <param name="tag">
/// Tag text without the angle brackets, starting with the tag name. A <c>null</c> value yields an empty map.
/// </param>
/// <returns>
/// Map from the lower-cased (invariant culture) attribute name to the text found between the quotes.
/// Attributes without a value, with an unquoted value, or with an unterminated quote are not included.
/// </returns>
/// <remarks>
/// Attribute names may contain letters, digits, '-', '_', ':' and '.', so that names such as
/// <c>http-equiv</c>, <c>data-id</c> or <c>xml:lang</c> are recognized.
/// </remarks>
public HashMap ParseAttributes( string tag )
{
    HashMap result = new HashMap();
    if( tag == null )
    {
        return result;
    }
    // Step over the tag name and the spacing that follows it
    int pos = SkipWhitespace( tag, SkipNonWhitespace( tag, 0 ) );
    while( pos < tag.Length )
    {
        int attrNameStart = pos;
        while( pos < tag.Length && IsAttributeNameChar( tag, pos ) )
        {
            pos++;
        }
        int attrNameEnd = pos;
        pos = SkipWhitespace( tag, pos );
        if( pos < tag.Length && tag[ pos ] == '=' )
        {
            pos++;
            pos = SkipWhitespace( tag, pos );
            if( pos < tag.Length && (tag[ pos ] == '\'' || tag[ pos ] == '\"') )
            {
                char quote = tag[ pos ];
                pos++;
                int attrValueStart = pos;
                while( pos < tag.Length && tag[ pos ] != quote )
                {
                    pos++;
                }
                if( pos < tag.Length )
                {
                    // Closing quote found: the attribute is well-formed, so record it
                    string attrName = tag.Substring( attrNameStart, attrNameEnd - attrNameStart ).ToLowerInvariant();
                    string attrValue = tag.Substring( attrValueStart, pos - attrValueStart );
                    result[ attrName ] = attrValue;
                }
            }
        }
        else if( attrNameEnd > attrNameStart )
        {
            // Valueless attribute (e.g. "disabled"): pos already stands at the next token,
            // so it must not be skipped as if it were the rest of this attribute.
            continue;
        }
        // Drop the remainder of the current token (closing quote, unquoted value, stray characters)
        pos = SkipWhitespace( tag, SkipNonWhitespace( tag, pos ) );
    }
    return result;
}

/// <summary>
/// Tells whether the character at the given position may be part of an attribute name.
/// </summary>
private static bool IsAttributeNameChar( string tag, int pos )
{
    if( Char.IsLetterOrDigit( tag, pos ) )
    {
        return true;
    }
    char ch = tag[ pos ];
    return ch == '-' || ch == '_' || ch == ':' || ch == '.';
}
#region properties
/// <summary>
/// Charset name picked up from a <c>meta</c> tag, or an empty string if none has been seen so far.
/// </summary>
public string CharSet
{
    get
    {
        return this._charset;
    }
}

/// <summary>
/// The most recent text fragment read while inside the <c>title</c> element, or an empty string if there was none.
/// </summary>
public string Title
{
    get
    {
        return this._title;
    }
}

/// <summary>
/// Whether the parser is currently inside the <c>body</c> element (or was created with <c>parseAll</c> set).
/// </summary>
public bool InBody
{
    get
    {
        return this._inBody;
    }
}

/// <summary>
/// Whether the parser is currently inside a heading element (<c>h1</c> to <c>h6</c>).
/// </summary>
public bool InHeading
{
    get
    {
        return this._inHeading;
    }
}

/// <summary>
/// Whether the parser is currently inside the <c>title</c> element.
/// </summary>
public bool InTitle
{
    get
    {
        return this._inTitle;
    }
}

/// <summary>
/// Whether the parser is currently inside a <c>script</c> element.
/// </summary>
public bool InScript
{
    get
    {
        return this._inScript;
    }
}

/// <summary>
/// Whether the end of the HTML stream has been reached; no more fragments can be read once this is <c>true</c>.
/// </summary>
public bool Finished
{
    get
    {
        return this._finished;
    }
}
/// <summary>
/// Determines whether the parser should break its output into individual words and return each word
/// separately (<c>true</c>, the default), or return whole text runs up to the next tag (<c>false</c>).
/// </summary>
/// <remarks>
/// The original comment carries the bare marker "518" — presumably a version or build tag; TODO confirm.
/// </remarks>
public bool BreakWords
{
    get { return this._doBreakWords; }
    set { this._doBreakWords = value; }
}
#endregion
/// <summary>
/// Reads a whole tag, including all attributes, from the reader. The opening angle bracket is
/// expected to have been consumed already; the closing one is consumed but not returned.
/// </summary>
/// <returns>The text between the angle brackets.</returns>
/// <remarks>
/// A closing bracket inside a quoted attribute value does not end the tag. A double quote opens a
/// quoted run anywhere in the tag. A single quote opens one only directly after '=' (ignoring
/// whitespace), so that apostrophes in comments and other free text are not mistaken for the start
/// of an attribute value. A quoted run ends at the matching quote character only.
/// </remarks>
protected internal string ReadTag()
{
    _tagBuilder.Length = 0;
    char activeQuote = '\0';      // Quote character of the value being read, or '\0' outside of quotes
    char lastSignificant = '\0';  // Last non-whitespace character seen outside of quotes
    while( true )
    {
        char ch = _reader.ReadChar( false );
        if( activeQuote == '\0' )
        {
            if( ch == '>' )
            {
                break;
            }
            if( ch == '\"' || (ch == '\'' && lastSignificant == '=') )
            {
                activeQuote = ch;
            }
            if( !Char.IsWhiteSpace( ch ) )
            {
                lastSignificant = ch;
            }
        }
        else if( ch == activeQuote )
        {
            activeQuote = '\0';
            lastSignificant = ch;
        }
        _tagBuilder.Append( ch );
    }
    return _tagBuilder.ToString();
}
/// <summary>
/// Advances past a run of non-whitespace characters.
/// </summary>
/// <param name="tag">Text to scan.</param>
/// <param name="pos">Index to start scanning at.</param>
/// <returns>Index of the first whitespace character at or after <paramref name="pos"/>, or the text length if there is none.</returns>
private static int SkipNonWhitespace( string tag, int pos )
{
    int cursor = pos;
    for( ; cursor < tag.Length; cursor++ )
    {
        if( Char.IsWhiteSpace( tag, cursor ) )
        {
            break;
        }
    }
    return cursor;
}
/// <summary>
/// Advances past a run of whitespace characters.
/// </summary>
/// <param name="tag">Text to scan.</param>
/// <param name="pos">Index to start scanning at.</param>
/// <returns>Index of the first non-whitespace character at or after <paramref name="pos"/>, or the text length if there is none.</returns>
private static int SkipWhitespace( string tag, int pos )
{
    int cursor = pos;
    for( ; cursor < tag.Length; cursor++ )
    {
        if( !Char.IsWhiteSpace( tag, cursor ) )
        {
            break;
        }
    }
    return cursor;
}
/// <summary>
/// Position in the input HTML stream, which is the number of characters consumed and converted into text by this moment.
/// </summary>
/// <remarks>
/// These are the characters of the HTML representation, as opposed to the plain text characters.
/// </remarks>
public int Position
{
    get
    {
        int consumed = _reader.Position;
        return consumed;
    }
}
#region tag handlers
/// <summary>
/// Extracts the charset declared by a <c>meta</c> tag and stores it, lower-cased, in the parser.
/// </summary>
/// <param name="instance">Parser whose charset is to be updated.</param>
/// <param name="tag">Text of the <c>meta</c> tag, without the angle brackets.</param>
/// <remarks>
/// Two forms are recognized: the pragma form (<c>http-equiv</c> naming <c>content-type</c> plus a
/// <c>charset=</c> entry, with the attributes in any order and any quoting), and the short form
/// <c>meta charset="..."</c>. A tag that yields no charset value leaves the current one untouched.
/// TODO: the tag should be honestly parsed.
/// </remarks>
private static void HandleMeta( HTMLParser instance, string tag )
{
    // Invariant lower-casing: a culture-sensitive ToLower() turns 'I' into a dotless 'ı' under
    // Turkish/Azeri cultures, which would break the "http-equiv" match below.
    tag = tag.ToLowerInvariant();

    int index = tag.IndexOf( "charset=", StringComparison.Ordinal );
    if( index < 0 )
    {
        return;
    }
    if( tag.IndexOf( "http-equiv", StringComparison.Ordinal ) >= 0 )
    {
        // Pragma form: only the content-type declaration carries the document charset
        if( tag.IndexOf( "content-type", StringComparison.Ordinal ) < 0 )
        {
            return;
        }
    }
    else if( tag.IndexOf( "content", StringComparison.Ordinal ) >= 0 )
    {
        // Some other meta tag (name/content pair) that merely mentions "charset=" in its content
        return;
    }

    index += 8; // length of "charset="
    while( index < tag.Length && Char.IsWhiteSpace( tag, index ) )
    {
        index++;
    }
    // In the short form the value may be wrapped in quotes of its own
    if( index < tag.Length && (tag[ index ] == '"' || tag[ index ] == '\'') )
    {
        index++;
    }
    int charsetEnd = tag.IndexOfAny( new char[] {'"', '\'', ' ', ';', ',', '/', '\t', '\r', '\n'}, index );
    if( charsetEnd < 0 )
    {
        charsetEnd = tag.Length;
    }
    string charset = tag.Substring( index, charsetEnd - index ).Trim();
    if( charset.Length > 0 )
    {
        instance._charset = charset;
    }
}
/// <summary>Handles <c>title</c>: marks the parser as being inside the title.</summary>
private static void OpeningTitle( HTMLParser instance, string tag )
{
    instance._inTitle = true;
}

/// <summary>Handles <c>/title</c>: marks the parser as having left the title.</summary>
private static void ClosingTitle( HTMLParser instance, string tag )
{
    instance._inTitle = false;
}

/// <summary>Handles <c>body</c>: marks the parser as being inside the body.</summary>
private static void OpeningBody( HTMLParser instance, string tag )
{
    instance._inBody = true;
}

/// <summary>Handles <c>/body</c>: marks the parser as having left the body.</summary>
private static void ClosingBody( HTMLParser instance, string tag )
{
    instance._inBody = false;
}

/// <summary>Handles <c>script</c>: marks the parser as being inside a script, whose content is not returned as text.</summary>
private static void OpeningScript( HTMLParser instance, string tag )
{
    instance._inScript = true;
}

/// <summary>Handles <c>/script</c>: marks the parser as having left the script.</summary>
private static void ClosingScript( HTMLParser instance, string tag )
{
    instance._inScript = false;
}

/// <summary>Handles <c>h1</c> to <c>h6</c>: marks the parser as being inside a heading.</summary>
private static void OpeningHeading( HTMLParser instance, string tag )
{
    instance._inHeading = true;
}

/// <summary>Handles <c>/h1</c> to <c>/h6</c>: marks the parser as having left the heading.</summary>
private static void ClosingHeading( HTMLParser instance, string tag )
{
    instance._inHeading = false;
}
#endregion
/// <summary>
/// A reader that provides the input text.
/// </summary>
protected internal HtmlEntityReader _reader;
// True once the end of the input has been reached (or the input was empty to begin with).
protected internal bool _finished;
// Pooled scratch buffer in which ReadTag assembles the text of the current tag.
protected internal StringBuilder _tagBuilder;
// Pooled scratch buffer in which ReadNextFragment assembles the current text fragment.
protected internal StringBuilder _fragmentBuilder;
// Charset found by the meta handler; empty until one is seen.
protected internal string _charset;
// Most recent fragment read while inside the title; empty until one is read.
protected internal string _title;
// State flags maintained by the built-in tag handlers.
protected internal bool _inBody;
protected internal bool _inHeading;
protected internal bool _inScript;
protected internal bool _inTitle;
// Built-in tag registry shared by all parser instances; filled in by the static constructor.
protected internal static CharTrie _tagsTrie;
protected internal static HashMap _tagsHandlers;
// Per-instance tag registry; both stay null until AddTagHandler is first called.
protected internal CharTrie _localTagsTrie;
protected internal HashMap _localTagsHandlers;
/// <summary>
/// Determines whether parser should break its output to individual words and return each word separately, or not.
/// </summary>
protected internal bool _doBreakWords = true;
}
public class HtmlTools
{
// Matches an angle-bracketed run that contains no nested angle brackets.
private static readonly Regex _rxStripHTML = new Regex( "<[^<>]+>" );
// Matches a self-closing "br" with optional spaces, e.g. "<br/>" or "< br / >".
// NOTE(review): the slash is mandatory in this pattern, so a plain "<br>" is not matched — confirm this is intended.
private static readonly Regex _rxLineBreak = new Regex( "< *br */ *>" );
// Shared HtmlLinkConverter instance; its usage is outside the visible part of the file.
private static readonly HtmlLinkConverter _htmlLinkConverter = new HtmlLinkConverter();
/// <summary>
/// Tries to detect the charset declared in an HTML stream.
/// </summary>
/// <param name="reader">Source of the HTML text. It is left open when the method returns.</param>
/// <returns>
/// The charset found in a <c>meta</c> tag ahead of the body, or the header name of the default
/// encoding if none is declared. Underscores in the name are replaced with hyphens.
/// </returns>
/// <remarks>
/// Scanning stops at the first fragment that belongs to the body. A charset found before that point
/// is honoured even when the document has no <c>body</c> tag at all and the scan runs to the end of
/// the stream.
/// </remarks>
public static string DetectCharset( TextReader reader )
{
    string charset = Encoding.Default.HeaderName;
    using( HTMLParser parser = new HTMLParser( reader ) )
    {
        parser.CloseReader = false;
        while( !parser.Finished )
        {
            parser.ReadNextFragment();
            if( parser.InBody )
            {
                break;
            }
        }
        // Checked after the loop, so that a document whose body is never entered still reports its charset
        if( parser.CharSet.Length > 0 )
        {
            charset = parser.CharSet;
        }
    }
    return charset.Replace( '_', '-' );
}
///
/// Skips scripts from a document.
///
public static string SkipScripts( string htmlText )
{
string text = htmlText.ToLower();
int scriptOffset = 0;
int offset;
StringBuilder skipper = StringBuilderPool.Alloc();
try
{
while( scriptOffset < text.Length &&
(offset = text.IndexOf( "