///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.IO;
using System.Text;
using JetBrains.DataStructures;
using JetBrains.Omea.HTML;
using NUnit.Framework;

namespace OmniaMeaBaseTests
{
    /// <summary>
    /// Tests for <c>HTMLParser</c>: fragment extraction, charset detection, termination on
    /// malformed input and attribute parsing. Most scenarios come in pairs: one with the parser's
    /// default settings and one with <c>BreakWords</c> set to <c>false</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): several HTML fixtures below contain no markup at all (for example the
    /// "simple body" input has no tags). It looks like tags and entity references were stripped
    /// from the literals at some point; confirm the fixtures against version control.
    /// </remarks>
    [TestFixture]
    public class HTMLParserTests
    {
        /// <summary>
        /// Upper bound on <c>ReadNextFragment</c> calls a parser may need before reporting
        /// <c>Finished</c> in the termination tests.
        /// </summary>
        private const int MaxFinishIterations = 1000;

        /// <summary>
        /// Creates a parser that reads <paramref name="html"/> from an in-memory stream encoded
        /// with <see cref="Encoding.Default"/>.
        /// </summary>
        private HTMLParser CreateParser( string html )
        {
            return new HTMLParser( new StreamReader( new MemoryStream( Encoding.Default.GetBytes( html ) ) ) );
        }

        /// <summary>
        /// Invokes the parser repeatedly until it reports <c>Finished</c> and concatenates the
        /// returned fragments as-is (no separator is inserted between them). Then verifies that
        /// one more read raises <see cref="EndOfStreamException"/>.
        /// </summary>
        /// <param name="parser">Parser to drain.</param>
        /// <returns>All fragments concatenated in reading order.</returns>
        /// <exception cref="InvalidOperationException">
        /// The read past the end returned a value instead of throwing.
        /// </exception>
        private string ReadAllFragments( HTMLParser parser )
        {
            StringBuilder text = new StringBuilder();
            while( !parser.Finished )
            {
                text.Append( parser.ReadNextFragment() );
            }

            try
            {
                // Any normal return here is a failure; only the message differs.
                if( parser.ReadNextFragment().Length != 0 )
                {
                    throw new InvalidOperationException( "Parser must return an empty fragment having read the whole text (if there's a tag after the last returned meaningful string)." );
                }
                throw new InvalidOperationException( "Parser must throw an exception if reading beyond end of stream." );
            }
            catch( EndOfStreamException ) // It's expected
            {
            }

            return text.ToString();
        }

        /// <summary>
        /// Reads the parser fragment by fragment and checks that each returned fragment equals the
        /// corresponding expected one, that <c>Finished</c> is <c>false</c> before every read, and
        /// that it is <c>true</c> after the last one.
        /// </summary>
        private static void AssertFragmentSequence( HTMLParser parser, params string[] expectedFragments )
        {
            foreach( string expected in expectedFragments )
            {
                Assert.AreEqual( false, parser.Finished );
                Assert.AreEqual( expected, parser.ReadNextFragment() );
            }
            Assert.AreEqual( true, parser.Finished );
        }

        /// <summary>
        /// Reads fragments until the parser reports <c>Finished</c> and fails the test if that
        /// takes <see cref="MaxFinishIterations"/> reads or more.
        /// </summary>
        private static void AssertFinishesWithinLimit( HTMLParser parser )
        {
            int reads = 0;
            while( ( reads < MaxFinishIterations ) && !parser.Finished )
            {
                parser.ReadNextFragment();
                reads++;
            }

            if( reads >= MaxFinishIterations )
            {
                Assert.Fail( "The parser has failed to finish." );
            }
        }

        [Test]
        public void NoBody()
        {
            string noBodyHtml = " text to be ignored ";
            using( HTMLParser parser = CreateParser( noBodyHtml ) )
            {
                Assert.AreEqual( 0, parser.ReadNextFragment().Length, "Text outside HTML body is read!" );
            }
        }

        [Test]
        public void NoBodyNoWordBreak()
        {
            string noBodyHtml = " text to be ignored ";
            using( HTMLParser parser = CreateParser( noBodyHtml ) )
            {
                parser.BreakWords = false;
                Assert.AreEqual( 0, parser.ReadNextFragment().Length, "Text outside HTML body is read!" );
            }
        }

        [Test]
        public void SimpleBody()
        {
            string html = "text in bodytext to be ignored";
            using( HTMLParser parser = CreateParser( html ) )
            {
                Assert.AreEqual( "text in body", ReadAllFragments( parser ), "Invalid simple body text!" );
            }
        }

        [Test]
        public void SimpleBodyNoWordBreak()
        {
            string html = "text in bodytext to be ignored";
            using( HTMLParser parser = CreateParser( html ) )
            {
                parser.BreakWords = false;
                AssertFragmentSequence( parser, "text in body", "" );
            }
        }

        [Test]
        public void QuotesInTag()
        {
            string html = "1st frag\n\n2nd frag ";
            using( HTMLParser parser = CreateParser( html ) )
            {
                Assert.AreEqual( "1st frag 2nd frag ", ReadAllFragments( parser ), "Invalid fragments!" );
            }
        }

        [Test]
        public void QuotesInTagNoWordBreak()
        {
            string html = "1st frag\n\n2nd frag ";
            using( HTMLParser parser = CreateParser( html ) )
            {
                parser.BreakWords = false;
                AssertFragmentSequence( parser, "1st frag", " 2nd frag ", "" );
            }
        }

        [Test]
        public void Title()
        {
            string html = "The title1st frag\n\n2nd frag ";
            using( HTMLParser parser = CreateParser( html ) )
            {
                Assert.AreEqual( "The title1st frag 2nd frag ", ReadAllFragments( parser ), "Invalid fragments!" );
            }
        }

        [Test]
        public void TitleNoWordBreak()
        {
            string html = "The title1st frag\n\n2nd frag ";
            using( HTMLParser parser = CreateParser( html ) )
            {
                parser.BreakWords = false;
                AssertFragmentSequence( parser, "The title", "1st frag", " 2nd frag ", "" );
            }
        }

        [Test]
        public void Scripts()
        {
            string html = "The title1st frag\n\n2nd frag ";
            using( HTMLParser parser = CreateParser( html ) )
            {
                Assert.AreEqual( "The title1st frag 2nd frag ", ReadAllFragments( parser ), "Invalid fragments!" );
            }
        }

        [Test]
        public void ScriptsNoWordBreak()
        {
            string html = "The title1st frag\n\n2nd frag ";
            using( HTMLParser parser = CreateParser( html ) )
            {
                parser.BreakWords = false;
                AssertFragmentSequence( parser, "The title", "1st frag", " 2nd frag ", "" );
            }
        }

        [Test]
        public void CharEntityReferences()
        {
            // The literal previously contained raw line breaks and unescaped quotes; they are
            // written as escapes here so that the same characters form a valid string literal.
            string html = "\n\ninclude <list>\n\ninclude \"omniamea.h\"\n\n#include «Kama—Sutra»\n\n";
            using( HTMLParser parser = CreateParser( html ) )
            {
                Assert.AreEqual( "include include \"omniamea.h\"#include «Kama—Sutra»", ReadAllFragments( parser ), "Invalid fragments!" );
            }
        }

        [Test]
        public void CharEntityReferencesNoWordBreak()
        {
            // Same fixture as CharEntityReferences, with line breaks and quotes escaped.
            string html = "\n\ninclude <list>\n\ninclude \"omniamea.h\"\n\n#include «Kama—Sutra»\n\n";
            using( HTMLParser parser = CreateParser( html ) )
            {
                parser.BreakWords = false;
                AssertFragmentSequence( parser, "include ", "include \"omniamea.h\"", "#include «Kama—Sutra»", "" );
            }
        }

        [Test]
        public void Charset()
        {
            string html = "1st frag";
            using( HTMLParser parser = CreateParser( html ) )
            {
                Assert.AreEqual( "1st frag", ReadAllFragments( parser ), "Invalid fragments" );
                Assert.AreEqual( "windows-1251", parser.CharSet, "Invalid charset!" );
            }
        }

        [Test]
        public void CharsetNoWordBreak()
        {
            string html = "1st frag";
            using( HTMLParser parser = CreateParser( html ) )
            {
                parser.BreakWords = false;
                AssertFragmentSequence( parser, "1st frag", "" );
                Assert.AreEqual( "windows-1251", parser.CharSet, "Invalid charset!" );
            }
        }

        /// <summary>Checks only that the read loop terminates; no fragment content is asserted.</summary>
        [Test]
        public void Finishing()
        {
            string html = "The title1st frag\n\n2nd frag \n";
            using( HTMLParser parser = CreateParser( html ) )
            {
                while( !parser.Finished )
                {
                    parser.ReadNextFragment();
                }
            }
        }

        /// <summary>Same as <see cref="Finishing"/> with <c>BreakWords</c> switched off.</summary>
        [Test]
        public void FinishingNoWordBreak()
        {
            string html = "The title1st frag\n\n2nd frag \n";
            using( HTMLParser parser = CreateParser( html ) )
            {
                parser.BreakWords = false;
                while( !parser.Finished )
                {
                    parser.ReadNextFragment();
                }
            }
        }

        [Test]
        public void FinishingOnUnclosed()
        {
            // NOTE(review): input restored to match FinishingOnUnclosedNoWordBreak; the previous
            // literal ("The title") contained no unclosed tag. Confirm against version control.
            // BreakWords is intentionally left at its default here: the previous version set it
            // to false, which made this test a duplicate of the NoWordBreak variant.
            string html = "<HTML><HEAD><Title>The title";
            using( HTMLParser parser = CreateParser( html ) )
            {
                AssertFinishesWithinLimit( parser );
            }
        }

        [Test]
        public void FinishingOnUnclosedNoWordBreak()
        {
            string html = "<HTML><HEAD><Title>The title";
            using( HTMLParser parser = CreateParser( html ) )
            {
                parser.BreakWords = false;
                AssertFinishesWithinLimit( parser );
            }
        }

        [Test]
        public void FinishingOnOverclosed()
        {
            string html = "<HTML><HEAD><Title>The title</</</</</</</a></a></html></head>";
            using( HTMLParser parser = CreateParser( html ) )
            {
                AssertFinishesWithinLimit( parser );
            }
        }

        [Test]
        public void FinishingOnOverclosedNoWordBreak()
        {
            // NOTE(review): opening tags restored to match FinishingOnOverclosed; the previous
            // literal started directly with "The title". Confirm against version control.
            string html = "<HTML><HEAD><Title>The title</</</</</</</a></a></html></head>";
            using( HTMLParser parser = CreateParser( html ) )
            {
                parser.BreakWords = false;
                AssertFinishesWithinLimit( parser );
            }
        }

        [Test]
        public void Attributes()
        {
            using( HTMLParser parser = CreateParser( "" ) )
            {
                HashMap attributes = parser.ParseAttributes( "link rel=\"stylesheet\" HRef=\"/styles-site.css\" type = 'text/css' /" );
                Assert.AreEqual( 3, attributes.Count );
                Assert.AreEqual( "stylesheet", attributes[ "rel" ] );
                // The attribute is spelled "HRef" in the input and looked up in lower case.
                Assert.AreEqual( "/styles-site.css", attributes[ "href" ] );
                Assert.AreEqual( "text/css", attributes[ "type" ] );
            }
        }

        [Test]
        public void AttributesNoWordBreak()
        {
            using( HTMLParser parser = CreateParser( "" ) )
            {
                parser.BreakWords = false;
                HashMap attributes = parser.ParseAttributes( "link rel=\"stylesheet\" HRef=\"/styles-site.css\" type = 'text/css' /" );
                Assert.AreEqual( 3, attributes.Count );
                Assert.AreEqual( "stylesheet", attributes[ "rel" ] );
                Assert.AreEqual( "/styles-site.css", attributes[ "href" ] );
                Assert.AreEqual( "text/css", attributes[ "type" ] );
            }
        }
    }

    /// <summary>
    /// Tests for <c>HtmlEntityReader.Read</c>. In these tests the first argument is <c>true</c>
    /// where decoded entities are expected and the second is <c>false</c> for the "peek" calls
    /// (presumably "decode entities" and "advance the position" respectively; verify against the
    /// reader's declaration). The <c>out</c> argument receives a length that the tests compare
    /// with the expected source length of the character just read.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the seed strings look as if their entity references were already decoded
    /// (several expectations, e.g. a length of 6 for character 160, do not match the visible
    /// seeds). Confirm the fixtures against version control.
    /// </remarks>
    [TestFixture]
    public class HtmlEntityReaderTests
    {
        private HtmlEntityReader _reader = null;

        /// <summary>Drops the reader before and after every test.</summary>
        [SetUp, TearDown]
        public void Clean()
        {
            _reader = null;
        }

        /// <summary>Creates a fresh reader over <paramref name="text"/>.</summary>
        protected void Seed( string text )
        {
            _reader = new HtmlEntityReader( new StringReader( text ) );
        }

        /// <summary>
        /// Length the peek tests expect the reader to report for <paramref name="ch"/>:
        /// 6 for character 160, 4 for angle brackets, 1 for anything else.
        /// </summary>
        private static int ExpectedEntityLength( char ch )
        {
            switch( ch )
            {
                case (char) 160:
                    return 6;
                case '<':
                case '>':
                    return 4;
                default:
                    return 1;
            }
        }

        [Test]
        public void Plain()
        {
            string seed = "Come and <see>";
            Seed( seed );
            StringBuilder read = new StringBuilder();
            int len;
            while( !_reader.Eof )
            {
                read.Append( (char) _reader.Read( false, true, out len ) );
                Assert.AreEqual( 1, len );
            }
            Assert.AreEqual( seed, read.ToString() );
        }

        [Test]
        public void Entities()
        {
            string seed = "Come and <see> — © «HornHoof™ Inc»";
            Seed( seed );
            StringBuilder read = new StringBuilder();
            int len;
            char ch;
            while( !_reader.Eof )
            {
                ch = (char) _reader.Read( true, true, out len );
                read.Append( ch );
                switch( ch )
                {
                    case (char) 160:
                        Assert.AreEqual( 6, len );
                        break;
                    case '<':
                    case '>':
                        Assert.AreEqual( 4, len );
                        break;
                    case '—':
                        Assert.AreEqual( "—".Length, len );
                        break;
                    case '©':
                        Assert.AreEqual( "©".Length, len );
                        break;
                    case '«':
                        Assert.AreEqual( "«".Length, len );
                        break;
                    case '»':
                        Assert.AreEqual( "»".Length, len );
                        break;
                    case '™':
                        Assert.AreEqual( "™".Length, len );
                        break;
                    default:
                        Assert.AreEqual( 1, len );
                        break;
                }
            }
            Assert.AreEqual( "Come" + (char) 160 + "and — © «HornHoof™ Inc»", read.ToString() );
        }

        [Test]
        public void PeekPlain()
        {
            string seed = "Come and <see>";
            Seed( seed );
            StringBuilder read = new StringBuilder();
            int len;
            char chPeek, chRead;
            while( !_reader.Eof )
            {
                // The peek and the following read must report the same character.
                chPeek = (char) _reader.Read( false, false, out len );
                Assert.AreEqual( 1, len );
                chRead = (char) _reader.Read( false, true, out len );
                Assert.AreEqual( 1, len );
                Assert.AreEqual( chPeek, chRead );
                read.Append( chRead );
            }
            Assert.AreEqual( seed, read.ToString() );
        }

        [Test]
        public void PeekEntity()
        {
            string seed = "Come and <see>";
            Seed( seed );
            StringBuilder read = new StringBuilder();
            int len;
            int lenTest;
            char chPeek, chRead;
            while( !_reader.Eof )
            {
                chPeek = (char) _reader.Read( true, false, out len );
                lenTest = ExpectedEntityLength( chPeek );
                Assert.AreEqual( lenTest, len );
                chRead = (char) _reader.Read( true, true, out len );
                Assert.AreEqual( lenTest, len );
                Assert.AreEqual( chPeek, chRead );
                read.Append( chRead );
            }
            Assert.AreEqual( "Come" + (char) 160 + "and ", read.ToString() );
        }

        [Test]
        public void PeekMixed()
        {
            string seed = "Come and <see>";
            Seed( seed );
            StringBuilder sbPlain = new StringBuilder();
            StringBuilder sbEntity = new StringBuilder();
            int len;
            int lenTest;
            char chPeek, chRead;
            while( !_reader.Eof )
            {
                // Peek with the first argument set, then consume a single character with it
                // cleared, so the two builders accumulate different views of the same input.
                chPeek = (char) _reader.Read( true, false, out len );
                lenTest = ExpectedEntityLength( chPeek );
                Assert.AreEqual( lenTest, len );
                sbEntity.Append( chPeek );
                chRead = (char) _reader.Read( false, true, out len );
                Assert.AreEqual( 1, len );
                sbPlain.Append( chRead );
            }
            Assert.AreEqual( seed, sbPlain.ToString() );
            Assert.AreEqual( "Come" + (char) 160 + "#160;and gt;", sbEntity.ToString() );
        }
    }
}