/// /// Copyright © 2003-2008 JetBrains s.r.o. /// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder). /// using System; using System.IO; using System.Text; using System.Xml; using JetBrains.Omea.Base; namespace JetBrains.Omea.RSSPlugin { /// /// Summary description for XmlPreparer. /// internal class XmlPreparer { private static string _HtmlEntities = null; private const int _Enc_Invalid = -1; //Bad Encoding private const int _Enc_UniCodeBE = 0; //Unicode Big Endian private const int _Enc_UniCode = 1; //Unicode Little Endian private const int _Enc_UCS4BE = 2; //UCS4 BigEndian private const int _Enc_UCS4BEB = 3; //UCS4 BigEnding with Byte order mark private const int _Enc_UCS4 = 4; //UCS4 Little Endian private const int _Enc_UCS4B = 5; //UCS4 Little Ending with Byte order mark private const int _Enc_UCS434 = 6; //UCS4 order 3412 private const int _Enc_UCS434B = 7; //UCS4 order 3412 with Byte order mark private const int _Enc_UCS421 = 8; //UCS4 order 2143 private const int _Enc_UCS421B = 9; //UCS4 order 2143 with Byte order mark private const int _Enc_EBCDIC = 10; //EBCDIC private const int _Enc_UTF8 = 11; //UTF8 private const int _Enc_ASCII = 12; //ASCII private static int[,] _EncodingTable = { // Unknown 0000 feff fffe efbb 3c00 003c 3f00 003f 3c3f 786d 4c6f a794 bf3c /*Unknown*/ {_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid }, /*0000*/ {_Enc_Invalid ,_Enc_Invalid ,_Enc_UCS4BEB ,_Enc_UCS421B ,_Enc_Invalid ,_Enc_UCS421 ,_Enc_UCS4BE ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid }, /*feff*/ {_Enc_UniCodeBE ,_Enc_UCS434 ,_Enc_UniCodeBE ,_Enc_UniCodeBE ,_Enc_UniCodeBE ,_Enc_UniCodeBE ,_Enc_UniCodeBE ,_Enc_UniCodeBE ,_Enc_UniCodeBE ,_Enc_UniCodeBE ,_Enc_UniCodeBE ,_Enc_UniCodeBE ,_Enc_UniCodeBE ,_Enc_Invalid }, /*fffe*/ {_Enc_UniCode ,_Enc_UCS4B ,_Enc_UniCode ,_Enc_UniCode ,_Enc_UniCode ,_Enc_UniCode ,_Enc_UniCode ,_Enc_UniCode ,_Enc_UniCode ,_Enc_UniCode ,_Enc_UniCode ,_Enc_UniCode ,_Enc_UniCode ,_Enc_Invalid }, /*efbb*/ {_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_UTF8 }, /*3c00*/ {_Enc_Invalid ,_Enc_UCS4 ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_UniCode ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid }, /*003c*/ {_Enc_Invalid ,_Enc_UCS434 ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_UniCodeBE ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid }, /*3f00*/ {_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid }, /*003f*/ {_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid }, /*3c3f*/ {_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_ASCII ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid }, /*786d*/ {_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid }, /*4c6f*/ {_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_EBCDIC ,_Enc_Invalid }, /*a794*/ {_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid }, /*bf3c*/ {_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid ,_Enc_Invalid } }; private Stream _stream = null; private string _encodingName = null; private Encoding _encoding = null; private int _skipBytes = 0; internal XmlPreparer( Stream stream, string encodingName ) { _stream = stream; _encodingName = encodingName; } internal bool PrepareXML() { // Read first 256 bytes and try to detect endoding by it. byte[] streamStartBytes = new byte[256]; int cBytes = _stream.Read( streamStartBytes, 0, 256 ); _stream.Seek( 0, SeekOrigin.Begin ); _encoding = GetEncoding( streamStartBytes, cBytes ); if( _encoding == null ) { if( _encodingName == null ) { return false; } try { _encoding = Encoding.GetEncoding( _encodingName ); } catch( NotSupportedException ) { } } return _encoding != null; } internal string GetXML() { if( _encoding == null ) { new InvalidOperationException( "GetXML() is called without sucessuful PrepareXML()" ); } byte[] buffer = new byte[ 4096 ]; char[] chars = new char[ buffer.Length ]; StringBuilder sb = StringBuilderPool.Alloc(); try { int read; Decoder dec = _encoding.GetDecoder(); if( _skipBytes > 0 ) { while( ( _skipBytes -= _stream.Read( buffer, 0, _skipBytes ) ) > 0 ); } while( ( read = _stream.Read( buffer, 0, buffer.Length ) ) > 0 ) { int cc; try { cc = dec.GetChars( buffer, 0, read, chars, 0 ); } catch( ArgumentException ) { chars = new char[ dec.GetCharCount( buffer, 0, read ) ] ; cc = dec.GetChars( buffer, 0, read, chars, 0 ); } sb.Append( chars, 0, cc ); } _stream.Seek( 0, SeekOrigin.Begin ); // Cut out restricted chars for(int i = 0; i < sb.Length; ++i ) { int c = sb[i]; // XML Spec 1.1, paragraph 2.1 and 2.2 // document ::= prolog element Misc* - Char* RestrictedChar Char* // RestrictedChar ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F] if( (c >= 0x0001 && c <= 0x0008) || (c >= 0x000B && c <= 0x000C) || (c >= 0x000E && c <= 0x001F) || (c >= 0x007F && c <= 0x0084) || (c >= 0x0086 && c <= 0x009F) ) { sb[i] = '?'; } } return sb.ToString(); } finally { StringBuilderPool.Dispose( sb ); } } private Encoding GetEncoding( byte[] bytes, int cbytes ) { int enc = HaveBOM(bytes,cbytes); switch( enc ) { case _Enc_ASCII: case _Enc_Invalid: // Unknown, or ``" ); if( i > 0 ) { xml = xml.Substring( 0, i ); } // Try to find `` encoding="'' int encStart = xml.IndexOf( " encoding=" ); if( encStart > 0 ) { encStart += 10; char q = xml[ encStart ]; encStart += 1; int encEnd = xml.IndexOf( q, encStart ); if( encEnd < encStart ) { // No ``encoding=""'', it is UTF-8 return new UTF8Encoding( false, false ); } encEnd -= 1; string encoding = xml.Substring( encStart, encEnd - encStart + 1 ); if( encoding.ToUpper() == "UTF-8" ) { return new UTF8Encoding( false, false ); } else { try { return Encoding.GetEncoding( encoding ); } catch( NotSupportedException ) { // Unknown one return null; } catch( ArgumentException ) { // Unknown one return null; } } } // No "encoding" attribute in XML declaration, assume UTF-8 return new UTF8Encoding( false, false ); } private static int GetEncodingIndex(int word) { switch(word) { case 0x0000: return 1; case 0xfeff: return 2; case 0xfffe: return 3; case 0xefbb: return 4; case 0x3c00: return 5; case 0x003c: return 6; case 0x3f00: return 7; case 0x003f: return 8; case 0x3c3f: return 9; case 0x786d: return 10; case 0x4c6f: return 11; case 0xa794: return 12; case 0xbf3c: return 13; default: return 0; //unknown } } private static int HaveBOM( byte[] bytes, int cbytes ) { if( cbytes < 2 ) { return -1; } int index1 = GetEncodingIndex( bytes[0] << 8 | bytes[1] ); int index2 = 0; if( cbytes >= 4 ) { index2 = GetEncodingIndex( bytes[2] << 8 | bytes[3] ); } return _EncodingTable[ index1, index2 ]; } internal static string HtmlEntites() { if( _HtmlEntities != null ) { return _HtmlEntities; } StringBuilder sb = StringBuilderPool.Alloc(); try { XmlDocument xml = new XmlDocument(); xml.Load( JetBrains.Omea.HTML.HtmlEntityReader.GetHtmlEntitiesStream() ); foreach( XmlElement e in xml.GetElementsByTagName( "Entity" ) ) { int val = Int32.Parse( e.GetAttribute( "Value" ) ); sb.AppendFormat( "\n", e.GetAttribute( "Name" ), (char)val ); } _HtmlEntities = sb.ToString(); } catch { _HtmlEntities = ""; } finally { StringBuilderPool.Dispose( sb ); } return _HtmlEntities; } } }