///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.IO;
using System.Text;
using JetBrains.DataStructures;
using JetBrains.Omea.HTML;
using NUnit.Framework;
namespace OmniaMeaBaseTests
{
[TestFixture]
public class HTMLParserTests
{
/// <summary>
/// Builds an <see cref="HTMLParser"/> that reads the given text from memory.
/// </summary>
/// <param name="html">Text to parse; it is converted to bytes with <see cref="Encoding.Default"/>.</param>
/// <returns>A new parser wrapping a <see cref="StreamReader"/> over those bytes.</returns>
private HTMLParser CreateParser( string html )
{
    byte[] rawBytes = Encoding.Default.GetBytes( html );
    MemoryStream stream = new MemoryStream( rawBytes );
    StreamReader reader = new StreamReader( stream );
    return new HTMLParser( reader );
}
/// <summary>
/// Reads fragments from the parser until it reports <c>Finished</c> and returns them
/// concatenated in reading order (nothing is inserted between fragments).
/// Afterwards it performs one more read, which is required to throw
/// <see cref="EndOfStreamException"/>.
/// </summary>
/// <param name="parser">Parser to drain.</param>
/// <returns>All fragments appended to each other.</returns>
/// <exception cref="InvalidOperationException">
/// The extra read after <c>Finished</c> returned a value instead of throwing.
/// </exception>
private string ReadAllFragments( HTMLParser parser )
{
    StringBuilder collected = new StringBuilder();
    while( !parser.Finished )
    {
        collected.Append( parser.ReadNextFragment() );
    }

    string extraFragment;
    try
    {
        extraFragment = parser.ReadNextFragment();
    }
    catch( EndOfStreamException )
    {
        // Expected outcome: reading past the end is rejected by the parser.
        return collected.ToString();
    }

    // The read succeeded, which is a failure either way; only the message differs.
    if( extraFragment.Length != 0 )
    {
        throw new InvalidOperationException( "Parser must return an empty fragment having read the whole text (if there's a tag after the last returned meaningful string)." );
    }
    throw new InvalidOperationException( "Parser must throw an exception if reading beyond end of stream." );
}
/// <summary>
/// Checks that the first fragment read from <c>noBodyHTML</c> is empty; otherwise the test
/// throws with "Text outside HTML body is read!".
/// </summary>
/// <remarks>
/// NOTE(review): the literal below spans two physical lines and shows no markup;
/// presumably the original HTML tags were lost - confirm against the repository copy.
/// </remarks>
[Test]
public void NoBody()
{
string noBodyHTML = "
text to be ignored ";
using( HTMLParser parser = CreateParser( noBodyHTML ) )
{
// A non-empty first fragment means the parser returned the text it was expected to skip.
if( parser.ReadNextFragment().Length > 0 )
throw new Exception( "Text outside HTML body is read!" );
}
}
/// <summary>
/// With <c>BreakWords</c> switched off, checks that the first fragment read from the
/// input is empty.
/// </summary>
/// <remarks>
/// NOTE(review): the literal shows no markup; presumably the original HTML tags were
/// lost - confirm against the repository copy.
/// </remarks>
[Test]
public void NoBodyNoWordBreak()
{
    string noBodyHTML = " text to be ignored ";
    using( HTMLParser parser = CreateParser( noBodyHTML ) )
    {
        parser.BreakWords = false;

        // Reported through NUnit so a violation shows up as a test failure with the
        // actual length, rather than as a generic exception.
        Assert.AreEqual( 0, parser.ReadNextFragment().Length, "Text outside HTML body is read!" );
    }
}
/// <summary>
/// Checks that draining the parser over the input yields exactly "text in body".
/// </summary>
/// <remarks>
/// NOTE(review): the literal shows no markup separating the body text from the text
/// to be ignored; presumably the original HTML tags were lost - confirm.
/// </remarks>
[Test]
public void SimpleBody()
{
    string html = "text in bodytext to be ignored";

    // Uses the fixture's shared helper instead of repeating the reader/stream construction.
    using( HTMLParser parser = CreateParser( html ) )
    {
        Assert.AreEqual( "text in body", ReadAllFragments( parser ), "Invalid simple body text!" );
    }
}
/// <summary>
/// With <c>BreakWords</c> switched off, expects the fragments "text in body" and "",
/// with <c>Finished</c> becoming true only after the empty fragment has been read.
/// </summary>
/// <remarks>
/// NOTE(review): the literal shows no markup; presumably the original HTML tags were
/// lost - confirm against the repository copy.
/// </remarks>
[Test]
public void SimpleBodyNoWordBreak()
{
    string html = "text in bodytext to be ignored";

    // Uses the fixture's shared helper instead of repeating the reader/stream construction.
    using( HTMLParser parser = CreateParser( html ) )
    {
        parser.BreakWords = false;

        Assert.IsFalse( parser.Finished );
        Assert.AreEqual( "text in body", parser.ReadNextFragment() );
        Assert.IsFalse( parser.Finished );
        Assert.AreEqual( "", parser.ReadNextFragment() );
        Assert.IsTrue( parser.Finished );
    }
}
/// <summary>
/// Checks that draining the parser over the input yields "1st frag 2nd frag ".
/// </summary>
/// <remarks>
/// NOTE(review): the test name refers to quotes inside a tag, but the literal shows no
/// tag; presumably the original markup was lost - confirm against the repository copy.
/// </remarks>
[Test]
public void QuotesInTag()
{
    string html = "1st frag 2nd frag ";

    // Uses the fixture's shared helper instead of repeating the reader/stream construction.
    using( HTMLParser parser = CreateParser( html ) )
    {
        Assert.AreEqual( "1st frag 2nd frag ", ReadAllFragments( parser ), "Invalid fragments!" );
    }
}
/// <summary>
/// With <c>BreakWords</c> switched off, expects the fragments "1st frag", " 2nd frag " and "",
/// with <c>Finished</c> becoming true only after the empty fragment has been read.
/// </summary>
/// <remarks>
/// NOTE(review): the literal below spans two physical lines and shows no tag;
/// presumably the original markup was lost - confirm against the repository copy.
/// </remarks>
[Test]
public void QuotesInTagNoWordBreak()
{
string HTML = "
1st frag 2nd frag ";
using( HTMLParser parser = new HTMLParser(
new StreamReader( new MemoryStream( Encoding.Default.GetBytes( HTML ) ) ) ) )
{
parser.BreakWords = false;
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual("1st frag", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual(" 2nd frag ", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
// The last read returns an empty fragment, and only then is Finished expected to be true.
Assert.AreEqual("", parser.ReadNextFragment());
Assert.AreEqual( true, parser.Finished );
}
}
/// <summary>
/// Checks that draining the parser over the input yields "The title1st frag 2nd frag ".
/// </summary>
/// <remarks>
/// NOTE(review): the literal below spans two physical lines and shows no markup;
/// presumably the original HTML tags were lost - confirm against the repository copy.
/// </remarks>
[Test]
public void Title()
{
string HTML = "
The title1st frag 2nd frag ";
using( HTMLParser parser = new HTMLParser(
new StreamReader( new MemoryStream( Encoding.Default.GetBytes( HTML ) ) ) ) )
{
Assert.AreEqual( "The title1st frag 2nd frag ", ReadAllFragments( parser ), "Invalid fragments!" );
}
}
/// <summary>
/// With <c>BreakWords</c> switched off, expects the fragments "The title", "1st frag",
/// " 2nd frag " and "", with <c>Finished</c> becoming true only after the empty fragment.
/// </summary>
/// <remarks>
/// NOTE(review): the literal below spans two physical lines and shows no markup;
/// presumably the original HTML tags were lost - confirm against the repository copy.
/// </remarks>
[Test]
public void TitleNoWordBreak()
{
string HTML = "
The title1st frag 2nd frag ";
using( HTMLParser parser = new HTMLParser(
new StreamReader( new MemoryStream( Encoding.Default.GetBytes( HTML ) ) ) ) )
{
parser.BreakWords = false;
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual("The title", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual("1st frag", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual(" 2nd frag ", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
// The last read returns an empty fragment, and only then is Finished expected to be true.
Assert.AreEqual("", parser.ReadNextFragment());
Assert.AreEqual( true, parser.Finished );
}
}
/// <summary>
/// Checks that draining the parser over the input yields "The title1st frag 2nd frag ".
/// </summary>
/// <remarks>
/// NOTE(review): the test name refers to scripts, but the literal below (which spans two
/// physical lines) shows no script content; presumably the original markup was lost - confirm.
/// </remarks>
[Test]
public void Scripts()
{
string HTML = "
The title1st frag 2nd frag ";
using( HTMLParser parser = new HTMLParser(
new StreamReader( new MemoryStream( Encoding.Default.GetBytes( HTML ) ) ) ) )
{
Assert.AreEqual( "The title1st frag 2nd frag ", ReadAllFragments( parser ), "Invalid fragments!" );
}
}
/// <summary>
/// With <c>BreakWords</c> switched off, expects the fragments "The title", "1st frag",
/// " 2nd frag " and "", with <c>Finished</c> becoming true only after the empty fragment.
/// </summary>
/// <remarks>
/// NOTE(review): the test name refers to scripts, but the literal below (which spans two
/// physical lines) shows no script content; presumably the original markup was lost - confirm.
/// </remarks>
[Test]
public void ScriptsNoWordBreak()
{
string HTML = "
The title1st frag 2nd frag ";
using( HTMLParser parser = new HTMLParser(
new StreamReader( new MemoryStream( Encoding.Default.GetBytes( HTML ) ) ) ) )
{
parser.BreakWords = false;
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual("The title", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual("1st frag", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual(" 2nd frag ", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
// The last read returns an empty fragment, and only then is Finished expected to be true.
Assert.AreEqual("", parser.ReadNextFragment());
Assert.AreEqual( true, parser.Finished );
}
}
/// <summary>
/// Checks that draining the parser over the input yields
/// <c>include include "omniamea.h"#include «Kama—Sutra»</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the literal below spans several physical lines, contains unescaped
/// double quotes and shows literal characters where the test name suggests character
/// entity references; presumably the original text was altered - confirm against the
/// repository copy.
/// </remarks>
[Test]
public void CharEntityReferences()
{
string HTML = "
include <list>
include "omniamea.h"
#include «Kama—Sutra»
";
using( HTMLParser parser = new HTMLParser(
new StreamReader( new MemoryStream( Encoding.Default.GetBytes( HTML ) ) ) ) )
{
Assert.AreEqual( "include include \"omniamea.h\"#include «Kama—Sutra»", ReadAllFragments( parser ), "Invalid fragments!" );
}
}
/// <summary>
/// With <c>BreakWords</c> switched off, expects the fragments <c>include </c>,
/// <c>include "omniamea.h"</c>, <c>#include «Kama—Sutra»</c> and an empty fragment, with
/// <c>Finished</c> becoming true only after the empty fragment has been read.
/// </summary>
/// <remarks>
/// NOTE(review): the literal below spans several physical lines, contains unescaped
/// double quotes and shows literal characters where the test name suggests character
/// entity references; presumably the original text was altered - confirm against the
/// repository copy.
/// </remarks>
[Test]
public void CharEntityReferencesNoWordBreak()
{
string HTML = "include <list>
include "omniamea.h"
#include «Kama—Sutra»
";
using( HTMLParser parser = new HTMLParser(
new StreamReader( new MemoryStream( Encoding.Default.GetBytes( HTML ) ) ) ) )
{
parser.BreakWords = false;
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual("include ", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual("include \"omniamea.h\"", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
Assert.AreEqual("#include «Kama—Sutra»", parser.ReadNextFragment());
Assert.AreEqual( false, parser.Finished );
// The last read returns an empty fragment, and only then is Finished expected to be true.
Assert.AreEqual("", parser.ReadNextFragment());
Assert.AreEqual( true, parser.Finished );
}
}
/// <summary>
/// Checks that draining the parser yields "1st frag" and that the parser then reports
/// "windows-1251" through <c>CharSet</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the literal shows nothing that names a charset; presumably the original
/// markup declaring it was lost - confirm against the repository copy.
/// </remarks>
[Test]
public void Charset()
{
    string html = "1st frag";

    // Uses the fixture's shared helper instead of repeating the reader/stream construction.
    using( HTMLParser parser = CreateParser( html ) )
    {
        Assert.AreEqual( "1st frag", ReadAllFragments( parser ), "Invalid fragments" );
        Assert.AreEqual( "windows-1251", parser.CharSet, "Invalid charset!" );
    }
}
/// <summary>
/// With <c>BreakWords</c> switched off, expects the fragments "1st frag" and "", then
/// <c>Finished</c> to be true and <c>CharSet</c> to be "windows-1251".
/// </summary>
/// <remarks>
/// NOTE(review): the literal shows nothing that names a charset; presumably the original
/// markup declaring it was lost - confirm against the repository copy.
/// </remarks>
[Test]
public void CharsetNoWordBreak()
{
    string html = "1st frag";

    // Uses the fixture's shared helper instead of repeating the reader/stream construction.
    using( HTMLParser parser = CreateParser( html ) )
    {
        parser.BreakWords = false;

        Assert.IsFalse( parser.Finished );
        Assert.AreEqual( "1st frag", parser.ReadNextFragment() );
        Assert.IsFalse( parser.Finished );
        Assert.AreEqual( "", parser.ReadNextFragment() );
        Assert.IsTrue( parser.Finished );
        Assert.AreEqual( "windows-1251", parser.CharSet, "Invalid charset!" );
    }
}
/// <summary>
/// Reads fragments until the parser reports <c>Finished</c>. The test has no assertions;
/// it passes when the loop ends without an exception.
/// </summary>
/// <remarks>
/// NOTE(review): the literal shows no markup; presumably the original HTML tags were
/// lost - confirm against the repository copy.
/// </remarks>
[Test]
public void Finishing()
{
    string html = "The title1st frag 2nd frag \n";

    // Uses the fixture's shared helper instead of repeating the reader/stream construction.
    using( HTMLParser parser = CreateParser( html ) )
    {
        while( !parser.Finished )
        {
            parser.ReadNextFragment();
        }
    }
}
/// <summary>
/// With <c>BreakWords</c> switched off, reads fragments until the parser reports
/// <c>Finished</c>. The test has no assertions; it passes when the loop ends without an exception.
/// </summary>
/// <remarks>
/// NOTE(review): the literal below spans two physical lines and shows no markup;
/// presumably the original HTML tags were lost - confirm against the repository copy.
/// </remarks>
[Test]
public void FinishingNoWordBreak()
{
string HTML = "
The title1st frag 2nd frag \n";
using( HTMLParser parser = new HTMLParser(
new StreamReader( new MemoryStream( Encoding.Default.GetBytes( HTML ) ) ) ) )
{
parser.BreakWords = false;
while( !parser.Finished )
{
parser.ReadNextFragment();
}
}
}
/// <summary>
/// Reads fragments until the parser reports <c>Finished</c> or the read counter reaches its
/// cap, and fails with "The parser has failed to finish." when the counter is 1000 or more.
/// </summary>
/// <remarks>
/// NOTE(review): the literal below spans two physical lines and shows no markup;
/// presumably the original (unclosed) HTML tags were lost - confirm against the repository copy.
/// </remarks>
[Test]
public void FinishingOnUnclosed()
{
string HTML = "
The title";
using( HTMLParser parser = new HTMLParser(
new StreamReader( new MemoryStream( Encoding.Default.GetBytes( HTML ) ) ) ) )
{
// NOTE(review): BreakWords is switched off here exactly as in FinishingOnUnclosedNoWordBreak;
// presumably this variant was meant to leave it untouched - confirm.
parser.BreakWords = false;
int a;
// NOTE(review): the loop is capped at 0x1000 (4096) reads while the check below compares
// against decimal 1000 - confirm which limit is intended.
for(a = 0; (a < 0x1000) && ( !parser.Finished ); a++)
parser.ReadNextFragment();
if(!(a < 1000))
Assert.Fail( "The parser has failed to finish." );
}
}
/// <summary>
/// With <c>BreakWords</c> switched off, reads fragments until the parser reports
/// <c>Finished</c> and fails if that does not happen in fewer than 1000 reads.
/// </summary>
/// <remarks>
/// NOTE(review): the literal shows no markup; presumably the original (unclosed) HTML
/// tags were lost - confirm against the repository copy.
/// </remarks>
[Test]
public void FinishingOnUnclosedNoWordBreak()
{
    // One limit for both the loop and the check. The previous code looped up to 0x1000
    // (4096) reads but failed at decimal 1000, so reads past 1000 could never change
    // the outcome.
    const int maxReads = 1000;
    string html = "The title";

    // Uses the fixture's shared helper instead of repeating the reader/stream construction.
    using( HTMLParser parser = CreateParser( html ) )
    {
        parser.BreakWords = false;

        int reads = 0;
        while( reads < maxReads && !parser.Finished )
        {
            parser.ReadNextFragment();
            reads++;
        }

        if( reads >= maxReads )
        {
            Assert.Fail( "The parser has failed to finish." );
        }
    }
}
/// <summary>
/// Reads fragments until the parser reports <c>Finished</c> and fails if that does not
/// happen in fewer than 1000 reads.
/// </summary>
/// <remarks>
/// NOTE(review): the literal shows no markup; presumably the original HTML (with surplus
/// closing tags, judging by the test name) was lost - confirm against the repository copy.
/// </remarks>
[Test]
public void FinishingOnOverclosed()
{
    // One limit for both the loop and the check. The previous code looped up to 0x1000
    // (4096) reads but failed at decimal 1000, so reads past 1000 could never change
    // the outcome.
    const int maxReads = 1000;
    string html = "The title";

    // Uses the fixture's shared helper instead of repeating the reader/stream construction.
    using( HTMLParser parser = CreateParser( html ) )
    {
        int reads = 0;
        while( reads < maxReads && !parser.Finished )
        {
            parser.ReadNextFragment();
            reads++;
        }

        if( reads >= maxReads )
        {
            Assert.Fail( "The parser has failed to finish." );
        }
    }
}
/// <summary>
/// With <c>BreakWords</c> switched off, reads fragments until the parser reports
/// <c>Finished</c> and fails if that does not happen in fewer than 1000 reads.
/// </summary>
/// <remarks>
/// NOTE(review): the literal shows no markup; presumably the original HTML (with surplus
/// closing tags, judging by the test name) was lost - confirm against the repository copy.
/// </remarks>
[Test]
public void FinishingOnOverclosedNoWordBreak()
{
    // One limit for both the loop and the check. The previous code looped up to 0x1000
    // (4096) reads but failed at decimal 1000, so reads past 1000 could never change
    // the outcome.
    const int maxReads = 1000;
    string html = "The title";

    // Uses the fixture's shared helper instead of repeating the reader/stream construction.
    using( HTMLParser parser = CreateParser( html ) )
    {
        parser.BreakWords = false;

        int reads = 0;
        while( reads < maxReads && !parser.Finished )
        {
            parser.ReadNextFragment();
            reads++;
        }

        if( reads >= maxReads )
        {
            Assert.Fail( "The parser has failed to finish." );
        }
    }
}
/// <summary>
/// Passes a tag's inner text to <c>ParseAttributes</c> and checks that the resulting map
/// holds exactly three entries, looked up by the lower-case keys "rel", "href" and "type".
/// </summary>
[Test]
public void Attributes()
{
    // The attribute text mixes quote styles, upper/lower-case names and spaces around '='.
    const string tagText = "link rel=\"stylesheet\" HRef=\"/styles-site.css\" type = 'text/css' /";

    using( HTMLParser parser = CreateParser( "" ) )
    {
        HashMap attributes = parser.ParseAttributes( tagText );

        Assert.AreEqual( 3, attributes.Count );
        Assert.AreEqual( "stylesheet", attributes[ "rel" ] );
        Assert.AreEqual( "/styles-site.css", attributes[ "href" ] );
        Assert.AreEqual( "text/css", attributes[ "type" ] );
    }
}
/// <summary>
/// With <c>BreakWords</c> switched off, passes a tag's inner text to <c>ParseAttributes</c>
/// and checks that the resulting map holds exactly three entries, looked up by the
/// lower-case keys "rel", "href" and "type".
/// </summary>
[Test]
public void AttributesNoWordBreak()
{
    // The attribute text mixes quote styles, upper/lower-case names and spaces around '='.
    const string tagText = "link rel=\"stylesheet\" HRef=\"/styles-site.css\" type = 'text/css' /";

    using( HTMLParser parser = CreateParser( "" ) )
    {
        parser.BreakWords = false;
        HashMap attributes = parser.ParseAttributes( tagText );

        Assert.AreEqual( 3, attributes.Count );
        Assert.AreEqual( "stylesheet", attributes[ "rel" ] );
        Assert.AreEqual( "/styles-site.css", attributes[ "href" ] );
        Assert.AreEqual( "text/css", attributes[ "type" ] );
    }
}
}
[TestFixture]
public class HtmlEntityReaderTests
{
private HtmlEntityReader _reader = null;
/// <summary>
/// Drops the reader reference; registered as both set-up and tear-down.
/// </summary>
[SetUp]
[TearDown]
public void Clean()
{
    this._reader = null;
}
/// <summary>
/// Replaces the fixture's reader with a new <see cref="HtmlEntityReader"/> over the given text.
/// </summary>
/// <param name="text">Text the reader will read from.</param>
protected void Seed( string text )
{
    StringReader source = new StringReader( text );
    _reader = new HtmlEntityReader( source );
}
/// <summary>
/// Reads the seed one character at a time with <c>Read( false, true, ... )</c> and checks
/// that every read reports a length of 1 and that the characters read reproduce the seed.
/// </summary>
[Test]
public void Plain()
{
    string seed = "Come and <see>";
    Seed( seed );

    StringBuilder readBack = new StringBuilder();
    while( !_reader.Eof )
    {
        int len;
        readBack.Append( (char) _reader.Read( false, true, out len ) );

        // Expected value goes first so NUnit's failure message labels the values correctly.
        Assert.AreEqual( 1, len );
    }

    Assert.AreEqual( seed, readBack.ToString() );
}
/// <summary>
/// Reads the seed with <c>Read( true, true, ... )</c> and checks the length reported for
/// each returned character, then compares the accumulated text with the expected string.
/// </summary>
/// <remarks>
/// NOTE(review): the seed and the per-character length expressions contain literal
/// characters where the expected lengths (6 and 4) suggest entity references;
/// presumably the original entity text was altered - confirm against the repository copy.
/// </remarks>
[Test]
public void Entities()
{
    string seed = "Come and <see> — © «HornHoof™ Inc»";
    Seed( seed );

    StringBuilder decoded = new StringBuilder();
    while( !_reader.Eof )
    {
        int len;
        char ch = (char) _reader.Read( true, true, out len );
        decoded.Append( ch );

        // Expected value goes first so NUnit's failure message labels the values correctly.
        switch( ch )
        {
            case (char) 160:
                Assert.AreEqual( 6, len );
                break;
            case '<':
            case '>':
                Assert.AreEqual( 4, len );
                break;
            case '—':
                Assert.AreEqual( "—".Length, len );
                break;
            case '©':
                Assert.AreEqual( "©".Length, len );
                break;
            case '«':
                Assert.AreEqual( "«".Length, len );
                break;
            case '»':
                Assert.AreEqual( "»".Length, len );
                break;
            case '™':
                Assert.AreEqual( "™".Length, len );
                break;
            default:
                Assert.AreEqual( 1, len );
                break;
        }
    }

    Assert.AreEqual( "Come" + (char) 160 + "and — © «HornHoof™ Inc»", decoded.ToString() );
}
/// <summary>
/// For every position, calls <c>Read( false, false, ... )</c> followed by
/// <c>Read( false, true, ... )</c> and checks that both report a length of 1 and return the
/// same character; the characters from the second call must reproduce the seed.
/// </summary>
[Test]
public void PeekPlain()
{
    string seed = "Come and <see>";
    Seed( seed );

    StringBuilder readBack = new StringBuilder();
    while( !_reader.Eof )
    {
        int len;

        // Expected value goes first so NUnit's failure message labels the values correctly.
        char chPeek = (char) _reader.Read( false, false, out len );
        Assert.AreEqual( 1, len );

        char chRead = (char) _reader.Read( false, true, out len );
        Assert.AreEqual( 1, len );

        Assert.AreEqual( chPeek, chRead );
        readBack.Append( chRead );
    }

    Assert.AreEqual( seed, readBack.ToString() );
}
/// <summary>
/// For every position, calls <c>Read( true, false, ... )</c> followed by
/// <c>Read( true, true, ... )</c> and checks that both return the same character and
/// report the same length: 6 for character 160, 4 for '&lt;' or '&gt;', otherwise 1.
/// </summary>
/// <remarks>
/// NOTE(review): the seed contains literal characters where the expected lengths suggest
/// entity references; presumably the original entity text was altered - confirm.
/// </remarks>
[Test]
public void PeekEntity()
{
    string seed = "Come and <see>";
    Seed( seed );

    StringBuilder decoded = new StringBuilder();
    while( !_reader.Eof )
    {
        int len;
        char chPeek = (char) _reader.Read( true, false, out len );

        int lenTest;
        switch( chPeek )
        {
            case (char) 160:
                lenTest = 6;
                break;
            case '<':
            case '>':
                lenTest = 4;
                break;
            default:
                lenTest = 1;
                break;
        }

        // Expected value goes first so NUnit's failure message labels the values correctly.
        Assert.AreEqual( lenTest, len );

        char chRead = (char) _reader.Read( true, true, out len );
        Assert.AreEqual( lenTest, len );
        Assert.AreEqual( chPeek, chRead );
        decoded.Append( chRead );
    }

    Assert.AreEqual( "Come" + (char) 160 + "and ", decoded.ToString() );
}
/// <summary>
/// For every position, calls <c>Read( true, false, ... )</c> and then
/// <c>Read( false, true, ... )</c>. The first call's length is checked against 6 for
/// character 160, 4 for '&lt;' or '&gt;', otherwise 1; the second call must always report 1.
/// The two character sequences are compared with separate expected strings.
/// </summary>
/// <remarks>
/// NOTE(review): the seed and the second expected string contain what look like
/// fragments of entity references; presumably the original entity text was altered -
/// confirm against the repository copy.
/// </remarks>
[Test]
public void PeekMixed()
{
    string seed = "Come and <see>";
    Seed( seed );

    StringBuilder sbPlain = new StringBuilder();
    StringBuilder sbEntity = new StringBuilder();
    while( !_reader.Eof )
    {
        int len;
        char chPeek = (char) _reader.Read( true, false, out len );

        int lenTest;
        switch( chPeek )
        {
            case (char) 160:
                lenTest = 6;
                break;
            case '<':
            case '>':
                lenTest = 4;
                break;
            default:
                lenTest = 1;
                break;
        }

        // Expected value goes first throughout so NUnit's failure message labels the values correctly.
        Assert.AreEqual( lenTest, len );
        sbEntity.Append( chPeek );

        char chRead = (char) _reader.Read( false, true, out len );
        Assert.AreEqual( 1, len );
        sbPlain.Append( chRead );
    }

    Assert.AreEqual( seed, sbPlain.ToString() );
    Assert.AreEqual( "Come" + (char) 160 + "#160;and gt;", sbEntity.ToString() );
}
}
}