///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.Collections;
using System.Collections.Specialized;
using System.Diagnostics;
using System.IO;
using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Xml;
using JetBrains.DataStructures;
using JetBrains.Omea.Base;
using JetBrains.Omea.HTML;
using JetBrains.Omea.OpenAPI;
namespace JetBrains.Omea.RSSPlugin
{
/// <summary>
/// Convenience base class for element parsers that implement
/// IFeedElementParser and use the default SkipNextRead value.
/// </summary>
public abstract class BaseFeedElementParser : IFeedElementParser
{
    /// <summary>
    /// Reads data from the reader at its current position and applies it to
    /// the given resource.
    /// </summary>
    /// <param name="resource">Resource that receives the parsed data.</param>
    /// <param name="reader">XML reader positioned by the caller.</param>
    public abstract void ParseValue( IResource resource, XmlReader reader );

    /// <summary>
    /// Returns false unless a derived parser overrides it.
    /// NOTE(review): presumably tells the item parsing loop not to advance
    /// the reader after ParseValue - confirm against the caller.
    /// </summary>
    public virtual bool SkipNextRead
    {
        get { return false; }
    }
}
#region RSS Elements Parsers
/// <summary>
/// Stores the trimmed text content of an element into a single property.
/// Properties whose declared data type is Int are parsed as integers; any
/// other property receives the text itself when it is not empty.
/// </summary>
public class FeedElementParser : BaseFeedElementParser
{
    private readonly int _propId;
    private readonly bool _override;

    /// <summary>Creates a parser that never replaces an existing value.</summary>
    /// <param name="propID">ID of the property to fill.</param>
    public FeedElementParser( int propID ) : this( propID, false )
    {
    }

    /// <summary>Creates a parser for the given property.</summary>
    /// <param name="propID">ID of the property to fill.</param>
    /// <param name="isOverride">
    /// When true the element is parsed even if the resource already has the property.
    /// </param>
    public FeedElementParser( int propID, bool isOverride )
    {
        _propId = propID;
        _override = isOverride;
    }

    /// <summary>
    /// Reads the element text and stores it in the property, unless the
    /// property is already present and overriding is disabled (in which case
    /// the reader is not touched at all).
    /// </summary>
    public override void ParseValue( IResource resource, XmlReader reader )
    {
        if ( !_override && resource.HasProp( _propId ) )
        {
            return;
        }

        string text = reader.ReadString().Trim();
        bool isIntProp = Core.ResourceStore.PropTypes[ _propId ].DataType == PropDataType.Int;
        if ( !isIntProp )
        {
            // Empty text is never written for non-integer properties.
            if ( text.Length > 0 )
            {
                resource.SetProp( _propId, text );
            }
            return;
        }

        // Malformed integers are only traced; the property is left untouched.
        try
        {
            resource.SetProp( _propId, Int32.Parse( text ) );
        }
        catch ( FormatException )
        {
            Trace.WriteLine( "Failed to parse integer value (Format)'" + text + "' for property " +
                Core.ResourceStore.PropTypes[ _propId ].Name );
        }
        catch ( OverflowException )
        {
            Trace.WriteLine( "Failed to parse integer value (Overflow)'" + text + "' for property " +
                Core.ResourceStore.PropTypes[ _propId ].Name );
        }
    }
}
/// <summary>
/// Stores the element text as the original feed name and, when the resource
/// has no display name yet, copies the original name into the name property.
/// </summary>
public class FeedNameParser : FeedElementParser
{
    public FeedNameParser() : base( Props.OriginalName )
    {
    }

    /// <summary>
    /// Fills Props.OriginalName through the base parser, then seeds
    /// Core.Props.Name from it if the name is not set.
    /// </summary>
    public override void ParseValue( IResource resource, XmlReader reader )
    {
        base.ParseValue( resource, reader );

        if ( resource.HasProp( Core.Props.Name ) )
        {
            return;
        }
        string originalName = resource.GetStringProp( Props.OriginalName );
        resource.SetProp( Core.Props.Name, originalName );
    }
}
/// <summary>
/// Stores the HTML-decoded, trimmed element text as the subject of the resource.
/// </summary>
internal class TitleParser : BaseFeedElementParser
{
    /// <summary>Reads the element text and writes it to Core.Props.Subject.</summary>
    public override void ParseValue( IResource resource, XmlReader reader )
    {
        string rawTitle = reader.ReadString();
        string subject = HtmlTools.SafeHtmlDecode( rawTitle ).Trim();
        resource.SetProp( Core.Props.Subject, subject );
    }
}
/// <summary>
/// Stores the element text as the item GUID. Unless the element carries
/// isPermaLink="false", the GUID is also used as the item link when the
/// item has no link yet.
/// </summary>
internal class GUIDParser : BaseFeedElementParser
{
    public override void ParseValue( IResource resource, XmlReader reader )
    {
        // Only an explicit "false" (any letter case) turns the permalink meaning off.
        bool notPermalink = reader.MoveToAttribute( "isPermaLink" ) && reader.Value.ToLower() == "false";
        if ( notPermalink )
        {
            reader.MoveToContent();
        }

        string guid = reader.ReadString();
        resource.SetProp( Props.GUID, guid );

        if ( !notPermalink && !resource.HasProp( Props.Link ) )
        {
            resource.SetProp( Props.Link, guid );
        }
    }
}
/// <summary>
/// Stores the "url" attribute of the element in Props.RSSSourceTagUrl and the
/// element text in Props.RSSSourceTag.
/// </summary>
internal class SourceTagParser : FeedElementParser
{
    public SourceTagParser() : base( Props.RSSSourceTag )
    {
    }

    public override void ParseValue( IResource resource, XmlReader reader )
    {
        bool hasUrl = reader.MoveToAttribute( "url" );
        if ( hasUrl )
        {
            string sourceUrl = reader.Value;
            resource.SetProp( Props.RSSSourceTagUrl, sourceUrl );
            // Step back from the attribute before the base parser reads the text.
            reader.MoveToContent();
        }

        base.ParseValue( resource, reader );
    }
}
/// <summary>
/// Copies the "url", "length" and "type" attributes of an enclosure element
/// into the enclosure properties of the resource.
/// </summary>
internal class EnclosureParser : BaseFeedElementParser
{
    public override void ParseValue( IResource resource, XmlReader reader )
    {
        if ( reader.MoveToAttribute( "url" ) )
        {
            string url = reader.Value;
            resource.SetProp( Props.EnclosureURL, url );
            resource.SetProp( Props.EnclosureDownloadingState, DownloadState.NotDownloaded );
            reader.MoveToContent();
        }

        if ( reader.MoveToAttribute( "length" ) )
        {
            StoreSize( resource, reader.Value );
            reader.MoveToContent();
        }

        if ( reader.MoveToAttribute( "type" ) )
        {
            string mimeType = reader.Value;
            resource.SetProp( Props.EnclosureType, mimeType );
            reader.MoveToContent();
        }
    }

    /// <summary>
    /// Parses the declared enclosure length and stores it, clamping negative
    /// values to zero. Unparsable values are traced and otherwise ignored.
    /// </summary>
    private static void StoreSize( IResource resource, string rawLength )
    {
        try
        {
            int size = Math.Max( 0, Int32.Parse( rawLength ) );
            resource.SetProp( Props.EnclosureSize, size );
        }
        catch ( FormatException )
        {
            Trace.WriteLine( "Failed to parse enclosure size " + rawLength );
        }
        catch ( OverflowException )
        {
            Trace.WriteLine( "Enclosure size too large: " + rawLength );
        }
    }
}
/// <summary>
/// Parses an "image" element. An empty element is treated as an RDF
/// reference (rdf:resource attribute); otherwise the nested title, url and
/// link elements are copied into the image properties.
/// </summary>
internal class ImageParser : BaseFeedElementParser
{
    private const string RDFNamespace = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";

    public override void ParseValue( IResource resource, XmlReader reader )
    {
        bool emptyImageElement = reader.NodeType == XmlNodeType.Element
            && reader.Name == "image"
            && reader.IsEmptyElement;
        if ( emptyImageElement )
        {
            // Try to extract "rdf:resource"
            if ( reader.MoveToAttribute( "resource", RDFNamespace ) )
            {
                resource.SetProp( Props.ImageURL, reader.Value );
                reader.MoveToContent();
            }
            return;
        }

        while ( reader.Read() )
        {
            XmlNodeType nodeType = reader.NodeType;
            if ( nodeType == XmlNodeType.EndElement && reader.Name == "image" )
            {
                break;
            }
            if ( nodeType != XmlNodeType.Element )
            {
                continue;
            }

            // The checks are deliberately independent (not else-if): each
            // ReadString moves the reader, and the name is re-read afterwards.
            if ( reader.Name == "title" )
            {
                resource.SetProp( Props.ImageTitle, reader.ReadString() );
            }
            if ( reader.Name == "url" )
            {
                resource.SetProp( Props.ImageURL, reader.ReadString() );
            }
            if ( reader.Name == "link" )
            {
                resource.SetProp( Props.ImageLink, reader.ReadString() );
            }
        }
    }
}
/// <summary>
/// Parses timestamps of the form YYYY-MM-DDThh:mm[:ss[.fraction]][(+|-)hh:mm]
/// and stores the result, converted to local time, in a date property.
/// Values that cannot be parsed are traced and the property is left untouched.
/// </summary>
internal class DCDateParser : BaseFeedElementParser
{
    /// <summary>
    /// One numeric component of the timestamp: an optional leading separator
    /// character followed by a fixed or unbounded run of ASCII digits.
    /// The parsed state is kept in the fragment until the next Parse call.
    /// </summary>
    private class DateFragment
    {
        // Digits beyond this count are consumed but not accumulated, so an
        // arbitrarily long fraction can never overflow the int accumulator.
        private const int MaxAccumulatedDigits = 9;

        private readonly char _startChar;
        private readonly int _digitCount;
        private readonly bool _optional;
        private int _value;
        private int _digitsRead;

        /// <param name="startChar">Separator expected before the digits, or '\0' for none.</param>
        /// <param name="digitCount">Maximum number of digits to read; 0 means "as many as follow".</param>
        /// <param name="optional">Whether a missing separator means "fragment absent" instead of an error.</param>
        internal DateFragment( char startChar, int digitCount, bool optional )
        {
            _startChar = startChar;
            _digitCount = digitCount;
            _optional = optional;
        }

        /// <summary>
        /// Parses the fragment starting at <paramref name="startOffset"/>.
        /// </summary>
        /// <returns>The offset of the first character after the fragment.</returns>
        /// <exception cref="FormatException">
        /// A required separator is missing, or a fixed-width fragment contains a non-digit.
        /// </exception>
        internal int Parse( string dateStr, int startOffset )
        {
            _value = 0;
            _digitsRead = 0;

            // Input exhausted: every remaining fragment silently evaluates to 0.
            if ( startOffset == dateStr.Length )
            {
                return startOffset;
            }

            int offset = startOffset;
            if ( _startChar != '\0' )
            {
                if ( dateStr[ offset ] != _startChar )
                {
                    if ( _optional )
                    {
                        return offset;
                    }
                    throw new FormatException( "Failed to parse date: starting char " + _startChar + " not found" );
                }
                offset++;
            }

            while ( offset < dateStr.Length && ( _digitCount == 0 || _digitsRead < _digitCount ) )
            {
                char c = dateStr[ offset ];
                // ASCII digits only: Char.IsDigit also accepts other Unicode
                // decimal digits, for which "c - '0'" is not the digit value.
                if ( c < '0' || c > '9' )
                {
                    if ( _digitCount == 0 )
                    {
                        break;
                    }
                    throw new FormatException( "Failed to find expected number of digits" );
                }
                if ( _digitsRead < MaxAccumulatedDigits )
                {
                    _value = _value * 10 + ( c - '0' );
                }
                offset++;
                _digitsRead++;
            }
            return offset;
        }

        /// <summary>Numeric value of the leading (at most nine) digits read by the last Parse call.</summary>
        internal int Value { get { return _value; } }

        /// <summary>Total number of digits consumed by the last Parse call.</summary>
        internal int DigitsRead { get { return _digitsRead; } }
    }

    private const int FractionIndex = 6;

    private readonly int _propID;
    // NOTE(review): fragments hold per-parse state, so one parser instance
    // must not be used from several threads at once - confirm how the
    // registered parser instances are shared.
    private readonly DateFragment[] _fragments = new DateFragment[7];
    private readonly DateFragment[] _tzFragments = new DateFragment[2];

    /// <param name="propID">ID of the date property that receives the parsed value.</param>
    public DCDateParser( int propID )
    {
        _propID = propID;
        _fragments[ 0 ] = new DateFragment( '\0', 4, false );  // year
        _fragments[ 1 ] = new DateFragment( '-', 2, false );   // month
        _fragments[ 2 ] = new DateFragment( '-', 2, false );   // day
        _fragments[ 3 ] = new DateFragment( 'T', 2, false );   // hour
        _fragments[ 4 ] = new DateFragment( ':', 2, false );   // minute
        _fragments[ 5 ] = new DateFragment( ':', 2, true );    // second
        _fragments[ FractionIndex ] = new DateFragment( '.', 0, true );  // fraction of a second
        _tzFragments[ 0 ] = new DateFragment( '\0', 2, false ); // zone offset hours
        _tzFragments[ 1 ] = new DateFragment( ':', 2, false );  // zone offset minutes
    }

    /// <summary>
    /// Reads the element text, parses it and stores the resulting local time
    /// in the configured property. Any parse failure is traced, not rethrown.
    /// </summary>
    public override void ParseValue( IResource resource, XmlReader reader )
    {
        string dateStr = reader.ReadString().Trim();
        try
        {
            int offset = 0;
            for ( int i = 0; i < _fragments.Length; i++ )
            {
                offset = _fragments[ i ].Parse( dateStr, offset );
            }

            // A "+hh:mm" zone is ahead of UTC, so it is subtracted to get UTC
            // (and "-hh:mm" is added). Without an explicit offset nothing is applied.
            int tzModifier = 0;
            int tzHours = 0;
            int tzMinutes = 0;
            if ( offset < dateStr.Length && ( dateStr[ offset ] == '+' || dateStr[ offset ] == '-' ) )
            {
                tzModifier = ( dateStr[ offset ] == '+' ) ? -1 : 1;
                offset++;
                offset = _tzFragments[ 0 ].Parse( dateStr, offset );
                offset = _tzFragments[ 1 ].Parse( dateStr, offset );
                tzHours = _tzFragments[ 0 ].Value;
                tzMinutes = _tzFragments[ 1 ].Value;
            }

            // The fraction is a decimal fraction of a second (".5" is 500 ms),
            // not a millisecond count; the result is always within 0..999.
            int msec = FractionToMilliseconds(
                _fragments[ FractionIndex ].Value, _fragments[ FractionIndex ].DigitsRead );

            DateTime dt = new DateTime(
                _fragments[ 0 ].Value,
                _fragments[ 1 ].Value,
                _fragments[ 2 ].Value,
                _fragments[ 3 ].Value,
                _fragments[ 4 ].Value,
                _fragments[ 5 ].Value,
                msec );
            dt = dt.AddHours( tzHours * tzModifier );
            dt = dt.AddMinutes( tzMinutes * tzModifier );
            resource.SetProp( _propID, dt.ToLocalTime() );
        }
        catch ( Exception e )
        {
            Trace.WriteLine( "Failed to parse dc:date " + dateStr + ": " + e.Message );
        }
    }

    /// <summary>
    /// Converts the digits of a fractional-second component to whole
    /// milliseconds, truncating anything finer than a millisecond.
    /// </summary>
    /// <param name="value">Numeric value of the leading (at most nine) fraction digits.</param>
    /// <param name="digitsRead">Total number of fraction digits that were present.</param>
    private static int FractionToMilliseconds( int value, int digitsRead )
    {
        int significant = Math.Min( digitsRead, 9 );
        if ( significant == 0 )
        {
            return 0;
        }
        while ( significant > 3 )
        {
            value /= 10;
            significant--;
        }
        while ( significant < 3 )
        {
            value *= 10;
            significant++;
        }
        return value;
    }
}
/// <summary>
/// Parses the element text with RFC822DateParser and stores the result in a
/// date property. Parse failures are traced and the property is left untouched.
/// </summary>
internal class RFCDateParser : BaseFeedElementParser
{
    private readonly int _propID;

    /// <param name="propID">ID of the property that receives the parsed date.</param>
    public RFCDateParser( int propID )
    {
        _propID = propID;
    }

    public override void ParseValue( IResource resource, XmlReader reader )
    {
        string rawDate = reader.ReadString();
        try
        {
            resource.SetProp( _propID, RFC822DateParser.ParseDate( rawDate ) );
        }
        catch ( Exception parseError )
        {
            Trace.WriteLine( "Failed to parse RFC-822 date " + rawDate + ": " + parseError.Message );
        }
    }
}
/// <summary>
/// Parses the author of a feed. The raw text is stored in Props.Author; when
/// an e-mail address can be recognised in it, the matching e-mail account and
/// contact are linked to the feed.
/// </summary>
internal class FeedAuthorParser : BaseFeedElementParser
{
    // person@domain.com (Name)
    private readonly Regex _creatorRX = new Regex( @"([^@]+@[^@]+)\s+\(([^()]+)\)" );
    // name (person@domain.com)
    private readonly Regex _creatorRX2 = new Regex( @"([^@]+)\s+\(([^@]+@[^@]+)\)" );
    // bare address
    private readonly Regex _emailRX = new Regex( @"[^@]+@[^@]+" );

    public FeedAuthorParser()
    {
    }

    public override void ParseValue( IResource feed, XmlReader reader )
    {
        ParseAuthorString( feed, reader.ReadString() );
    }

    /// <summary>
    /// Stores the author text on the feed and links the contact data that can
    /// be extracted from it.
    /// </summary>
    /// <param name="feed">Feed resource being updated.</param>
    /// <param name="creator">Raw author text.</param>
    internal void ParseAuthorString( IResource feed, string creator )
    {
        feed.SetProp( Props.Author, creator );

        string email;
        string name;
        SplitCreator( creator, out email, out name );
        if ( email == null )
        {
            return;
        }

        // The e-mail account is linked only when the feed has no author e-mail link yet.
        if ( !feed.HasProp( Props.AuthorEmail ) )
        {
            IResource account = Core.ContactManager.FindOrCreateEmailAccount( email );
            if ( account != null )
            {
                account.AddLink( Props.AuthorEmail, feed );
            }
        }

        IContact author = Core.ContactManager.FindOrCreateContact( email, name );
        author.Resource.AddLink( Props.Weblog, feed );
    }

    /// <summary>
    /// Tries the three supported author formats in order and returns the
    /// recognised parts; both outputs are null when nothing matches, and the
    /// name is null when only a bare address is found.
    /// </summary>
    private void SplitCreator( string creator, out string email, out string name )
    {
        email = null;
        name = null;

        Match match = _creatorRX.Match( creator );
        if ( match.Success )
        {
            email = match.Groups[ 1 ].Value;
            name = match.Groups[ 2 ].Value;
            return;
        }

        match = _creatorRX2.Match( creator );
        if ( match.Success )
        {
            name = match.Groups[ 1 ].Value;
            email = match.Groups[ 2 ].Value;
            return;
        }

        match = _emailRX.Match( creator );
        if ( match.Success )
        {
            email = match.Value;
        }
    }
}
/// <summary>
/// Links an item to the contact named by the element text, using the "From"
/// property type. Blank author text is ignored.
/// </summary>
internal class ItemAuthorParser : BaseFeedElementParser
{
    public override void ParseValue( IResource resource, XmlReader reader )
    {
        string authorName = reader.ReadString();
        if ( authorName.Trim().Length == 0 )
        {
            return;
        }

        // The untrimmed text is what gets passed on, exactly as it was read.
        IContact contact = Core.ContactManager.FindOrCreateContact( null, authorName );
        Core.ContactManager.LinkContactToResource(
            Core.ResourceStore.PropTypes[ "From" ].Id, contact.Resource, resource, (IResource)null, authorName );
    }
}
/// <summary>
/// Stores the inner XML of the current element as the long body of the resource.
/// </summary>
internal class XhtmlBodyParser : IFeedElementParser
{
    /// <summary>Writes the element's inner XML to Core.Props.LongBody.</summary>
    public void ParseValue( IResource resource, XmlReader reader )
    {
        string innerXml = reader.ReadInnerXml();
        resource.SetProp( Core.Props.LongBody, innerXml );
    }

    /// <summary>
    /// Always true.
    /// NOTE(review): presumably because ReadInnerXml already moves the reader
    /// past the element - confirm against the parsing loop.
    /// </summary>
    public bool SkipNextRead
    {
        get { return true; }
    }
}
#endregion RSS Elements Parsers
#region ATOM Elements Parsers
/// <summary>
/// Generic parser for ATOM Link constructs.
/// Distinguishes three relations (as of release 2.1.2), selected by the
/// value of the "rel" attribute:
///   - "alternate": the obligatory relation type for an Atom entry; the
///     target becomes the link of the item.
///   - "related": any semantically close external link (from the point of
///     view of the post author, not of the Atom standard); stored as a
///     separate linked-post resource.
///   - "enclosure": describes an enclosure location, size and type.
///
/// NB: for compatibility with ill-formed ATOM feeds, a "link" element
/// without any "rel" attribute is treated as "alternate".
/// </summary>
internal class AtomEntryLinkParser : BaseFeedElementParser
{
    /// <summary>
    /// Reads the "href" and "rel" attributes of the link element, resolves
    /// the target against the link base of the resource (if any) and stores
    /// it according to the relation type. Unknown relations are ignored.
    /// </summary>
    public override void ParseValue( IResource resource, XmlReader reader )
    {
        string href = reader.GetAttribute( "href" );
        string rel = reader.GetAttribute( "rel" );
        if ( string.IsNullOrEmpty( rel ) )
        {
            rel = "alternate";
        }

        string linkBase = resource.GetPropText( Props.LinkBase );
        if ( linkBase.Length > 0 )
        {
            try
            {
                href = new Uri( new Uri( linkBase ), href ).ToString();
            }
            catch ( UriFormatException )
            {
                // Best effort: keep the unresolved href when the base or the
                // reference is not a valid URI.
            }
        }

        if ( rel == "enclosure" )
        {
            LinkEnclosureInformation( resource, href, reader );
        }
        else if ( rel == "related" )
        {
            LinkRelatedInformation( resource, href, reader );
        }
        else if ( rel == "alternate" )
        {
            resource.SetProp( Props.Link, href );
        }
    }

    /// <summary>
    /// Stores the enclosure URL together with the optional "type" and
    /// "length" attributes and marks the enclosure as not downloaded.
    /// </summary>
    private static void LinkEnclosureInformation( IResource resource, string href, XmlReader reader )
    {
        resource.SetProp( Props.EnclosureURL, href );

        // GetAttribute returns null when the attribute is absent, so the
        // value must be null-checked before it is used.
        string type = reader.GetAttribute( "type" );
        if ( !string.IsNullOrEmpty( type ) )
        {
            resource.SetProp( Props.EnclosureType, type );
        }

        string length = reader.GetAttribute( "length" );
        if ( !string.IsNullOrEmpty( length ) )
        {
            try
            {
                int enclosureSize = Int32.Parse( length );
                if ( enclosureSize < 0 )
                {
                    enclosureSize = 0;
                }
                resource.SetProp( Props.EnclosureSize, enclosureSize );
            }
            catch ( FormatException )
            {
                // Log the attribute text itself; the reader is positioned on
                // the element here, so reader.Value would be empty.
                Trace.WriteLine( "Failed to parse enclosure size " + length );
            }
            catch ( OverflowException )
            {
                Trace.WriteLine( "Enclosure size too large: " + length );
            }
        }

        resource.SetProp( Props.EnclosureDownloadingState, DownloadState.NotDownloaded );
    }

    /// <summary>
    /// Creates a linked-post resource for a "related" link (URL plus the
    /// optional "title" attribute) and links it to the item.
    /// </summary>
    private static void LinkRelatedInformation( IResource resource, string href, XmlReader reader )
    {
        string title = reader.GetAttribute( "title" );
        IResource newRelated = Core.ResourceStore.BeginNewResource( Props.RSSLinkedPostResource );
        try
        {
            newRelated.SetProp( Props.URL, href );
            if ( !string.IsNullOrEmpty( title ) )
            {
                newRelated.SetProp( Core.Props.Name, title );
            }
        }
        finally
        {
            newRelated.EndUpdate();
        }
        resource.AddLink( Props.LinkedPost, newRelated );
    }
}
/// <summary>
/// Parser for ATOM Channel Link constructs. Stores the target of the
/// feed-level "alternate" link in the configured property; links with any
/// other relation are ignored.
/// </summary>
internal class AtomChannelLinkParser : BaseFeedElementParser
{
    private readonly int _propId;
    private const string _expectRel = "alternate";

    /// <param name="propId">ID of the property that receives the link target.</param>
    public AtomChannelLinkParser( int propId )
    {
        _propId = propId;
    }

    /// <summary>
    /// Reads the "rel" and "href" attributes, resolves the target against
    /// the link base of the resource (if any) and stores it when the
    /// relation is "alternate".
    /// </summary>
    public override void ParseValue( IResource resource, XmlReader reader )
    {
        // A link without "rel" must be interpreted as "alternate"
        // (RFC 4287, section 4.2.7.2); the entry-level link parser applies
        // the same default.
        string rel = reader.GetAttribute( "rel" );
        if ( string.IsNullOrEmpty( rel ) )
        {
            rel = _expectRel;
        }

        string href = reader.GetAttribute( "href" );
        string linkBase = resource.GetPropText( Props.LinkBase );
        if ( linkBase.Length > 0 )
        {
            try
            {
                href = new Uri( new Uri( linkBase ), href ).ToString();
            }
            catch ( UriFormatException )
            {
                // Best effort: keep the unresolved href when the base or the
                // reference is not a valid URI.
            }
        }

        // The relation is a protocol token, so compare it ordinally rather
        // than with the rules of the current culture.
        if ( String.Equals( rel, _expectRel, StringComparison.OrdinalIgnoreCase ) )
        {
            resource.SetProp( _propId, href );
        }
    }
}
/// <summary>
/// Generic parser for ATOM Person constructs. Collects the nested name,
/// url/uri and email elements and links the matching contact to the resource.
/// </summary>
internal class AtomPersonParser : BaseFeedElementParser
{
    private readonly int _propId;

    /// <param name="propID">
    /// Link property to use. Core.ContactManager.Props.LinkFrom is handled by
    /// the contact manager; any other positive ID links the resource to the
    /// contact, and a negative ID links the contact to the resource with the
    /// negated ID.
    /// </param>
    public AtomPersonParser( int propID )
    {
        _propId = propID;
    }

    public override void ParseValue( IResource resource, XmlReader reader )
    {
        string personName = null;
        string personUrl = null;
        string personEmail = null;

        // Read until the end tag at the depth of the person element itself.
        int personDepth = reader.Depth;
        while ( reader.Read() )
        {
            XmlNodeType nodeType = reader.NodeType;
            if ( nodeType == XmlNodeType.EndElement )
            {
                if ( reader.Depth == personDepth )
                {
                    break;
                }
                continue;
            }
            if ( nodeType != XmlNodeType.Element || !IsAtomNamespace( reader.NamespaceURI ) )
            {
                continue;
            }

            switch ( reader.LocalName )
            {
                case "name":
                    personName = reader.ReadString();
                    break;
                case "url":
                case "uri":
                    personUrl = reader.ReadString();
                    break;
                case "email":
                    personEmail = reader.ReadString();
                    break;
            }
        }

        // Without a name or an e-mail there is nothing to identify a contact by.
        if ( string.IsNullOrEmpty( personName ) && string.IsNullOrEmpty( personEmail ) )
        {
            return;
        }

        IContact contact = Core.ContactManager.FindOrCreateContact( personEmail, personName );
        if ( _propId == Core.ContactManager.Props.LinkFrom )
        {
            // Fix OM-13266.
            if ( !resource.IsDeleting && !resource.IsDeleted )
            {
                Core.ContactManager.LinkContactToResource( Core.ContactManager.Props.LinkFrom,
                    contact.Resource, resource, personEmail, personName );
            }
        }
        else if ( _propId > 0 )
        {
            resource.AddLink( _propId, contact.Resource );
        }
        else
        {
            contact.Resource.AddLink( -_propId, resource );
        }

        // Never overwrite a home page the contact already has.
        if ( personUrl != null && contact.HomePage == string.Empty )
        {
            contact.HomePage = personUrl;
        }
    }

    /// <summary>Always true for this parser.</summary>
    public override bool SkipNextRead
    {
        get { return true; }
    }

    /// <summary>Tells whether the namespace is one of the two supported ATOM namespaces.</summary>
    private static bool IsAtomNamespace( string namespaceUri )
    {
        return namespaceUri == RSSParser.NamespaceATOM03 || namespaceUri == RSSParser.NamespaceATOM10;
    }
}
/// <summary>
/// Generic parser for ATOM 0.3 Content constructs. The "mode" attribute
/// ("xml" when absent, "escaped" or "base64") selects how the content is read.
/// </summary>
internal class AtomContentParser : BaseFeedElementParser
{
    private readonly TextFormat _expectedFormat;
    private readonly int _propID;

    /// <summary>Creates a parser that expects HTML, the default for this implementation.</summary>
    public AtomContentParser( int propID ) : this( propID, TextFormat.Html )
    {
    }

    /// <param name="propID">ID of the property that receives the content.</param>
    /// <param name="expectedFormat">
    /// Format the property should hold; with PlainText, markup read in "xml" mode is stripped.
    /// </param>
    public AtomContentParser( int propID, TextFormat expectedFormat )
    {
        _propID = propID;
        _expectedFormat = expectedFormat;
    }

    public override void ParseValue( IResource resource, XmlReader reader )
    {
        string mode = reader.GetAttribute( "mode" ) ?? "xml";
        string content = null;

        if ( String.Compare( mode, "xml", true ) == 0 )
        {
            content = ReadXmlContent( reader );
        }
        else if ( String.Compare( mode, "escaped", true ) == 0 )
        {
            content = HtmlTools.SafeHtmlDecode( reader.ReadString() );
        }
        else if ( String.Compare( mode, "base64", true ) == 0 )
        {
            // NB, LloiX (17.03.2008): base64 is meant for binary data, and
            // converting the decoded bytes to a string is both illogical and
            // invalid. An "attachment"-like notion would require changes in
            // the API and in feed article processing, so such content is
            // only reported and never stored.
            Core.ReportBackgroundException( new ApplicationException( "RssParser -- Processing base64-mode content for text." ) );
        }

        if ( content != null )
        {
            resource.SetProp( _propID, content );
        }
    }

    /// <summary>
    /// Reads "xml" mode content: plain text or CDATA is returned as is, any
    /// other node is returned as outer XML (stripped of markup when plain
    /// text is expected).
    /// </summary>
    private string ReadXmlContent( XmlReader reader )
    {
        reader.Read();
        XmlNodeType first = reader.NodeType;
        if ( first == XmlNodeType.CDATA || first == XmlNodeType.Text )
        {
            return reader.ReadString();
        }

        string markup = reader.ReadOuterXml();
        if ( _expectedFormat == TextFormat.PlainText )
        {
            markup = HtmlTools.StripHTML( markup );
        }
        return markup;
    }
}
/// <summary>
/// Stores an ATOM text construct as the plain-text original name and, when
/// the resource has no display name yet, copies the original name into the
/// name property.
/// </summary>
internal class AtomTitleParser : AtomTextParser
{
    public AtomTitleParser() : base( Props.OriginalName, TextFormat.PlainText )
    {
    }

    public override void ParseValue( IResource resource, XmlReader reader )
    {
        base.ParseValue( resource, reader );

        if ( resource.HasProp( Core.Props.Name ) )
        {
            return;
        }
        string originalName = resource.GetStringProp( Props.OriginalName );
        resource.SetProp( Core.Props.Name, originalName );
    }
}
/// <summary>
/// Generic parser for ATOM 1.0 Text constructs. The "type" attribute ("text"
/// when absent, "html"/"text/html" or "xhtml") selects how the content is
/// read; it is then converted to the format expected by the target property.
/// </summary>
internal class AtomTextParser: BaseFeedElementParser
{
    private readonly int _propId;
    private readonly TextFormat _expectFormat;

    /// <param name="propId">ID of the property that receives the text.</param>
    /// <param name="expectFormat">Format the property should hold (plain text or HTML).</param>
    public AtomTextParser( int propId, TextFormat expectFormat )
    {
        _propId = propId;
        _expectFormat = expectFormat;
    }

    public override void ParseValue( IResource resource, XmlReader reader )
    {
        string type = reader.GetAttribute( "type" ) ?? "text";
        bool wantPlainText = _expectFormat == TextFormat.PlainText;
        string content = null;

        if ( String.Compare( type, "text", true ) == 0 )
        {
            // Plain text has to be HTML-encoded when the property expects HTML.
            string text = reader.ReadString();
            content = wantPlainText ? text : HttpUtility.HtmlEncode( text );
        }
        else if ( String.Compare( type, "text/html", true ) == 0 ||
                  String.Compare( type, "html", true ) == 0 )
        {
            string html = reader.ReadString();
            content = wantPlainText ? HtmlTools.StripHTML( html ) : html;
        }
        else if ( String.Compare( type, "xhtml", true ) == 0 )
        {
            reader.Read(); // move to the xhtml:div element
            string xhtml = reader.ReadInnerXml();
            content = wantPlainText ? HtmlTools.StripHTML( xhtml ) : xhtml;
        }

        // Unknown types leave the property untouched.
        if ( content != null )
        {
            resource.SetProp( _propId, content );
        }
    }
}
/// <summary>
/// Stores the category of an ATOM entry, preferring the "label" attribute
/// and falling back to the "term" attribute.
/// </summary>
internal class AtomCategoryParser: BaseFeedElementParser
{
    public override void ParseValue( IResource resource, XmlReader reader )
    {
        string label = reader.GetAttribute( "label" );
        string category = string.IsNullOrEmpty( label ) ? reader.GetAttribute( "term" ) : label;

        if ( string.IsNullOrEmpty( category ) )
        {
            return;
        }
        resource.SetProp( Props.RSSCategory, category );
    }
}
/// <summary>
/// Parser registered for the ATOM 1.0 "source" element; it intentionally
/// does nothing with the element.
/// </summary>
internal class AtomSourceParser: BaseFeedElementParser
{
    /// <summary>No-op: neither the resource nor the reader is touched.</summary>
    public override void ParseValue( IResource resource, XmlReader reader )
    {
    }
}
#endregion ATOM Elements Parsers
/**
* General parsing framework for RSS and ATOM feeds.
*
* Element handling is table driven: static maps from "namespaceURI:localName"
* keys to IFeedElementParser instances are kept separately for RSS and ATOM,
* and for channel-level and item-level elements.
*/
internal class RSSParser
{
// Transient "RSSItem" resource that ParseChannel fills for the next item;
// it is created on demand when it is null. Static, so shared by all parsers.
internal static IResource _nextItem = null;
// Resource store, taken from Core.ResourceStore in the constructor.
private readonly IResourceStore _store;
// Feed resource being parsed.
private readonly IResource _feed;
// True when the feed has the Props.AllowEqualPosts property.
private readonly bool _allowEqualPosts;
// Values of the Props.ItemCommentFeed and Props.FeedComment2Feed links of the feed.
private readonly IResource _commentItem;
private readonly IResource _commentFeed;
// NOTE(review): not used in the visible part of the file; presumably the IDs
// of the items seen during the current parse - confirm.
private readonly IntHashSet _currentFeedItems = new IntHashSet();
// Set by Parse once a channel/feed element has been processed.
private bool _foundChannel;
// Deleted-item hashes stored on the feed before this parse (see FillDeletedItemsSet).
private HashSet _deletedItems;
// Hashes that should stay in the stored list after this parse (see SaveDeletedItemsSet).
private HashSet _newDeletedItems;
// Time at which the current Parse call started.
private DateTime _parseDate;
// Cached result of GetUniqueLinks; valid only when _uniqueLinksKnown is true.
private bool _uniqueLinks;
private bool _uniqueLinksKnown = false;
// XML namespaces of the supported feed formats and extension modules.
private const string NamespaceRSS09 = "http://my.netscape.com/rdf/simple/0.9/";
private const string NamespaceRSS091 = "http://my.netscape.com/publish/formats/rss-0.91.dtd";
private const string NamespaceRSS093 = "http://backend.userland.com/rss093";
private const string NamespaceRSS10 = "http://purl.org/rss/1.0/";
private const string NamespaceRSS10_1 = "http://www.purl.org/rss/1.0/";
private const string NamespaceRSS20 = "http://backend.userland.com/rss2";
private const string NamespaceDC = "http://purl.org/dc/elements/1.1/";
private const string NamespaceHTML = "http://www.w3.org/1999/xhtml";
private const string NamespaceContent = "http://purl.org/rss/1.0/modules/content/";
private const string NamespaceSyndication = "http://purl.org/rss/1.0/modules/syndication/";
private const string NamespaceSlash = "http://purl.org/rss/1.0/modules/slash/";
public const string NamespaceATOM03 = "http://purl.org/atom/ns#";
public const string NamespaceATOM10 = "http://www.w3.org/2005/Atom";
private const string NamespaceWFW = "http://wellformedweb.org/CommentAPI/";
// Namespaces (including "no namespace") for which the standard RSS elements
// are registered up front; Parse registers any other channel namespace on demand.
private static readonly string[] _rssNamespaces = new[]
{
"", NamespaceRSS09, NamespaceRSS091, NamespaceRSS093,
NamespaceRSS10, NamespaceRSS10_1, NamespaceRSS20
};
// Parser tables, created as case-insensitive hashtables and keyed by
// "namespaceURI:localName". _rssItemElements == null also serves as the
// "tables not registered yet" marker (see CheckItemsRegister).
private static Hashtable _rssItemElements;
private static Hashtable _rssChannelElements;
private static Hashtable _atomItemElements;
private static Hashtable _atomChannelElements;
// Match an href attribute value in double or single quotes.
// NOTE(review): their use is not visible in this part of the file.
private static readonly Regex _rxLink = new Regex( "href=\"([^\"]+)\"" );
private static readonly Regex _rxLink2 = new Regex( "href='([^']+)'" );
// NOTE(review): presumably raised for every parsed item; the raising code is
// not visible in this part of the file - confirm.
internal event ResourceEventHandler ItemParsed;
/**
 * Creates a parser for the given feed resource, makes sure the static
 * element parser tables are registered and caches the feed settings used
 * while parsing.
 */
public RSSParser( IResource feed )
{
    _store = Core.ResourceStore;
    CheckItemsRegister();

    _feed = feed;
    _allowEqualPosts = feed.HasProp( Props.AllowEqualPosts );
    _commentItem = feed.GetLinkProp( Props.ItemCommentFeed );
    _commentFeed = feed.GetLinkProp( Props.FeedComment2Feed );
}
/**
 * Clears the static RSS item parser table, so that the next
 * CheckItemsRegister call registers all element parsers again.
 * NOTE(review): the table is static, so this affects every RSSParser
 * instance and drops parsers added through the Register*ElementParser
 * methods - confirm this is intended.
 */
public void Dispose()
{
    RSSParser._rssItemElements = null;
}
#region Static Initialization
/**
 * Registers the RSS and ATOM element parsers unless that has been done
 * already; a non-null RSS item table is the "already registered" marker.
 */
private static void CheckItemsRegister()
{
    if ( _rssItemElements != null )
    {
        return;
    }
    RegisterRSSElements();
    RegisterAtomElements();
}
/**
 * Registers the elements for the RSS feeds and items: the standard elements
 * for every known RSS namespace, followed by the elements of the extension
 * modules and the elements common to RSS and ATOM.
 */
private static void RegisterRSSElements()
{
    _rssItemElements = CollectionsUtil.CreateCaseInsensitiveHashtable();
    _rssChannelElements = CollectionsUtil.CreateCaseInsensitiveHashtable();

    for ( int i = 0; i < _rssNamespaces.Length; i++ )
    {
        RegisterRssStandardElements( _rssNamespaces[ i ] );
    }

    Hashtable items = _rssItemElements;
    Hashtable channel = _rssChannelElements;

    // Item-level extension elements.
    items[ NamespaceContent + ":encoded" ] = new FeedElementParser( Core.Props.LongBody, true );
    items[ NamespaceDC + ":date" ] = new DCDateParser( Core.Props.Date );
    items[ NamespaceDC + ":creator" ] = new ItemAuthorParser();
    items[ NamespaceHTML + ":body" ] = new XhtmlBodyParser();
    items[ NamespaceSlash + ":comments" ] = new FeedElementParser( Props.CommentCount.Id );
    items[ NamespaceWFW + ":commentRSS" ] = new FeedElementParser( Props.CommentRSS );
    items[ NamespaceWFW + ":comment" ] = new FeedElementParser( Props.WfwComment );

    // Channel-level extension elements.
    channel[ NamespaceDC + ":creator" ] = new FeedAuthorParser();
    channel[ NamespaceSyndication + ":updatePeriod" ] = new FeedElementParser( Props.UpdatePeriod );
    channel[ NamespaceSyndication + ":updateFrequency" ] = new FeedElementParser( Props.UpdateFrequency );

    RegisterCommonElements( items );
}
/**
 * Registers the parsers of the standard RSS item and channel elements
 * under the given namespace URI (which may be empty).
 */
private static void RegisterRssStandardElements( string ns )
{
    Hashtable items = _rssItemElements;
    items[ ns + ":description" ] = new FeedElementParser( Core.Props.LongBody );
    items[ ns + ":link" ] = new FeedElementParser( Props.Link );
    items[ ns + ":category" ] = new FeedElementParser( Props.RSSCategory );
    items[ ns + ":comments" ] = new FeedElementParser( Props.CommentURL );
    items[ ns + ":guid" ] = new GUIDParser();
    items[ ns + ":pubDate" ] = new RFCDateParser( Core.Props.Date );
    items[ ns + ":title" ] = new TitleParser();
    items[ ns + ":enclosure" ] = new EnclosureParser();
    items[ ns + ":author" ] = new ItemAuthorParser();
    items[ ns + ":source" ] = new SourceTagParser();

    Hashtable channel = _rssChannelElements;
    channel[ ns + ":managingEditor" ] = new FeedAuthorParser();
    channel[ ns + ":pubDate" ] = new RFCDateParser( Props.PubDate );
    channel[ ns + ":title" ] = new FeedNameParser();
    channel[ ns + ":link" ] = new FeedElementParser( Props.HomePage );
    channel[ ns + ":description" ] = new FeedElementParser( Props.Description );
    channel[ ns + ":image" ] = new ImageParser();
}
/**
 * Registers the elements for the Atom channels and items, for both the
 * ATOM 0.3 and the ATOM 1.0 namespaces.
 */
private static void RegisterAtomElements()
{
    _atomItemElements = CollectionsUtil.CreateCaseInsensitiveHashtable();
    _atomChannelElements = CollectionsUtil.CreateCaseInsensitiveHashtable();

    Hashtable channel = _atomChannelElements;
    Hashtable items = _atomItemElements;

    // Elements handled identically in both ATOM versions; every key gets
    // its own parser instance.
    foreach ( string ns in new[] { NamespaceATOM03, NamespaceATOM10 } )
    {
        channel[ ns + ":link" ] = new AtomChannelLinkParser( Props.HomePage );
        channel[ ns + ":author" ] = new AtomPersonParser( -Props.Weblog );
        items[ ns + ":link" ] = new AtomEntryLinkParser();
        items[ ns + ":author" ] = new AtomPersonParser( Core.ContactManager.Props.LinkFrom );
        items[ ns + ":id" ] = new FeedElementParser( Props.GUID );
    }

    // Channel elements that differ between the versions.
    channel[ NamespaceATOM03 + ":title" ] = new FeedNameParser();
    channel[ NamespaceATOM10 + ":title" ] = new AtomTitleParser();
    channel[ NamespaceATOM03 + ":tagline" ] = new AtomContentParser( Props.Description );
    channel[ NamespaceATOM10 + ":subtitle" ] = new AtomTextParser( Props.Description, TextFormat.PlainText );
    channel[ NamespaceATOM10 + ":logo" ] = new FeedElementParser( Props.ImageURL );

    // Item elements that differ between the versions.
    items[ NamespaceATOM03 + ":title" ] = new AtomContentParser( Core.Props.Subject, TextFormat.PlainText );
    items[ NamespaceATOM10 + ":title" ] = new AtomTextParser( Core.Props.Subject, TextFormat.PlainText );
    items[ NamespaceATOM03 + ":created" ] = new DCDateParser( Core.Props.Date );
    items[ NamespaceATOM10 + ":published" ] = new DCDateParser( Core.Props.Date );
    items[ NamespaceATOM03 + ":modified" ] = new DCDateParser( Props.DateModified );
    items[ NamespaceATOM10 + ":updated" ] = new DCDateParser( Props.DateModified );
    items[ NamespaceATOM03 + ":summary" ] = new AtomContentParser( Props.Summary );
    items[ NamespaceATOM10 + ":summary" ] = new AtomTextParser( Props.Summary, TextFormat.Html );
    items[ NamespaceATOM03 + ":content" ] = new AtomContentParser( Core.Props.LongBody );
    items[ NamespaceATOM10 + ":content" ] = new AtomTextParser( Core.Props.LongBody, TextFormat.Html );
    items[ NamespaceATOM10 + ":category" ] = new AtomCategoryParser();
    items[ NamespaceATOM10 + ":source" ] = new AtomSourceParser();

    RegisterCommonElements( items );
}
/**
 * Registers the elements common for RSS and ATOM feeds in the given item
 * table; currently only dc:subject, which is stored as the category.
 */
private static void RegisterCommonElements( Hashtable itemElements )
{
    string subjectKey = NamespaceDC + ":subject";
    itemElements[ subjectKey ] = new FeedElementParser( Props.RSSCategory );
}
/**
 * Registers (or replaces) the parser of a channel-level element for the
 * given feed type. Feed types other than Rss and Atom are ignored.
 */
public static void RegisterChannelElementParser( FeedType type, string xmlNameSpace, string elementName,
    IFeedElementParser parser )
{
    CheckItemsRegister();

    string key = xmlNameSpace + ":" + elementName;
    if ( type == FeedType.Rss )
    {
        _rssChannelElements[ key ] = parser;
    }
    else if ( type == FeedType.Atom )
    {
        _atomChannelElements[ key ] = parser;
    }
}
/**
 * Registers (or replaces) the parser of an item-level element for the
 * given feed type. Feed types other than Rss and Atom are ignored.
 */
public static void RegisterItemElementParser( FeedType type, string xmlNameSpace, string elementName,
    IFeedElementParser parser )
{
    CheckItemsRegister();

    string key = xmlNameSpace + ":" + elementName;
    if ( type == FeedType.Rss )
    {
        _rssItemElements[ key ] = parser;
    }
    else if ( type == FeedType.Atom )
    {
        _atomItemElements[ key ] = parser;
    }
}
#endregion Static Initialization
/**
 * Parses the feed contained in the stream. Every RSS "channel" element and
 * every ATOM "feed" element found in the document is processed; the set of
 * deleted item hashes is saved after each of them.
 *
 * stream     - the feed data
 * encoding   - encoding of the data, or null to let the XML reader decide
 * parseItems - whether the items are parsed in addition to the channel data
 */
public void Parse( Stream stream, Encoding encoding, bool parseItems )
{
    _parseDate = DateTime.Now;
    _foundChannel = false;
    FillDeletedItemsSet();

    string encodingName = null;
    if ( encoding != null )
    {
        encodingName = encoding.BodyName;
    }

    XmlTextReader baseReader;
    XmlPreparer preparer = new XmlPreparer( stream, encodingName );
    if ( !preparer.PrepareXML() )
    {
        // Fall back to reading the raw stream when the preparer gives up.
        Trace.WriteLine( "Can not process feed '" + _feed.DisplayName + "' with preparer\n" );
        if ( encoding == null )
        {
            baseReader = new XmlTextReader( stream );
        }
        else
        {
            baseReader = new XmlTextReader( new StreamReader( stream, encoding ) );
        }
    }
    else
    {
        NameTable nameTable = new NameTable();
        XmlNamespaceManager namespaces = new LooseNSManager( nameTable );
        XmlParserContext context = new XmlParserContext( nameTable, namespaces, null, XmlSpace.None );
        string preparedXml = preparer.GetXML();
        baseReader = new XmlTextReader( preparedXml, XmlNodeType.Document, context );
    }

    // Following two lines are obsolete?
    baseReader.WhitespaceHandling = WhitespaceHandling.None;
    baseReader.XmlResolver = null;

    XmlReaderSettings settings = new XmlReaderSettings();
    settings.ValidationType = ValidationType.None;
    settings.XmlResolver = null;
    XmlReader reader = XmlReader.Create( baseReader, settings );

    while ( reader.Read() )
    {
        if ( reader.NodeType != XmlNodeType.Element )
        {
            continue;
        }

        bool isRssChannel = reader.LocalName == "channel";
        bool isAtomFeed = !isRssChannel
            && ( reader.NamespaceURI == NamespaceATOM03 || reader.NamespaceURI == NamespaceATOM10 )
            && reader.LocalName == "feed";
        if ( !isRssChannel && !isAtomFeed )
        {
            continue;
        }

        if ( isRssChannel )
        {
            // A channel in an unknown namespace gets the standard RSS
            // elements registered for that namespace on the fly.
            string channelNs = reader.NamespaceURI;
            if ( channelNs.Length > 0 && Array.IndexOf( _rssNamespaces, channelNs ) < 0 )
            {
                RegisterRssStandardElements( channelNs );
            }
            ParseChannel( reader, parseItems, "channel", "item", _rssChannelElements, _rssItemElements );
        }
        else
        {
            ParseChannel( reader, parseItems, "feed", "entry", _atomChannelElements, _atomItemElements );
        }

        SaveDeletedItemsSet();
        _foundChannel = true;
    }
}
/**
 * True when the last Parse call found and processed at least one RSS
 * channel or ATOM feed element.
 */
public bool FoundChannel
{
    get { return _foundChannel; }
}
/**
 * Parses the DeletedItems property of a feed to get the HashSet of
 * items which were marked as deleted after previous parse, and clears
 * the HashSet of items that will be marked as deleted after the current
 * parse.
 */
private void FillDeletedItemsSet()
{
    _deletedItems = new HashSet();
    _newDeletedItems = new HashSet();

    IStringList delItems = _feed.GetStringListProp( Props.DeletedItemHashList );
    try
    {
        foreach ( string delItem in delItems )
        {
            _deletedItems.Add( delItem );
        }
    }
    finally
    {
        // Release the list the same way SaveDeletedItemsSet does.
        delItems.Dispose();
    }
}
/**
 * Saves the new set of deleted item hashes to the feed property: every
 * hash that was stored before this parse but is not part of the new set
 * is removed from the stored list.
 */
private void SaveDeletedItemsSet()
{
    IStringList delItems = _feed.GetStringListProp( Props.DeletedItemHashList );
    try
    {
        foreach ( HashSet.Entry he in _deletedItems )
        {
            if ( !_newDeletedItems.Contains( he.Key ) )
            {
                delItems.Remove( (string)he.Key );
            }
        }
    }
    finally
    {
        // Dispose the list even when removing an entry fails.
        delItems.Dispose();
    }
}
/**
 * Tells whether the links of the feed items are unique. The answer is
 * computed once per parser and cached. A value stored in the UniqueLinks
 * property of the feed wins; otherwise the items are checked, a negative
 * result is stored immediately and a positive one only once the feed has
 * at least 50 items.
 */
private bool GetUniqueLinks()
{
    if ( _uniqueLinksKnown )
    {
        return _uniqueLinks;
    }

    if ( _feed.HasProp( Props.UniqueLinks ) )
    {
        _uniqueLinks = ( _feed.GetIntProp( Props.UniqueLinks ) == 1 );
    }
    else
    {
        _uniqueLinks = CheckUniqueLinks();
        if ( !_uniqueLinks )
        {
            _feed.SetProp( Props.UniqueLinks, 0 );
        }
        else if ( _feed.GetLinkCount( Props.RSSItem ) >= 50 )
        {
            _feed.SetProp( Props.UniqueLinks, 1 );
        }
    }

    _uniqueLinksKnown = true;
    return _uniqueLinks;
}
/**
 * Checks if all links in the feed point to different pages: returns false
 * as soon as two items of the feed carry the same link value. Items
 * without a link are ignored.
 */
private bool CheckUniqueLinks()
{
    HashSet seenLinks = new HashSet();
    foreach ( IResource item in _feed.GetLinksFrom( "RSSItem", Props.RSSItem ) )
    {
        string link = item.GetStringProp( Props.Link );
        if ( link == null )
        {
            continue;
        }
        if ( seenLinks.Contains( link ) )
        {
            return false;
        }
        seenLinks.Add( link );
    }
    return true;
}
/**
* Parses an RSS or ATOM channel using the specified maps for channel and item elements.
*/
private void ParseChannel( XmlReader reader, bool parseItems,
    string feedLocalName, string itemLocalName,
    Hashtable channelElements, Hashtable itemElements )
{
    // An xml:base attribute on the channel element becomes the link base of the feed.
    string channelBase = reader.GetAttribute( "xml:base" );
    if ( !string.IsNullOrEmpty( channelBase ) )
    {
        _feed.SetProp( Props.LinkBase, channelBase );
    }

    int channelChildDepth = reader.Depth + 1;
    bool channelClosed = false;
    while ( Core.State != CoreState.ShuttingDown && reader.Read() )
    {
        XmlNodeType nodeType = reader.NodeType;
        if ( nodeType == XmlNodeType.EndElement )
        {
            // Reading continues after the channel end tag; from then on
            // only item elements are processed.
            if ( reader.LocalName == feedLocalName )
            {
                channelClosed = true;
            }
            continue;
        }
        if ( nodeType != XmlNodeType.Element )
        {
            continue;
        }

        if ( parseItems && reader.LocalName == itemLocalName )
        {
            // The transient resource is reused until it is actually added to the feed.
            if ( _nextItem == null )
            {
                _nextItem = _store.NewResourceTransient( "RSSItem" );
            }
            // A leftover marker means the previous item was not cleaned up.
            if ( _nextItem.HasProp( Props._propFake ) )
            {
                throw new ApplicationException( "Feed-Post cleaning violation" );
            }
            _nextItem.SetProp( Props._propFake, _feed.Id );
            ParseItem( reader, _nextItem, itemElements );
            AddOrUpdateItemToFeed( _nextItem );
            continue;
        }

        if ( channelClosed || reader.Depth != channelChildDepth )
        {
            continue;
        }

        // Direct children of the channel are dispatched by "namespace:localname".
        string elementKey = reader.NamespaceURI + ":" + reader.LocalName;
        IFeedElementParser handler = (IFeedElementParser) channelElements[ elementKey ];
        if ( handler != null )
        {
            handler.ParseValue( _feed, reader );
        }
    }
}
/**
* Parses an RSS or ATOM item using the specified map for item elements.
*/
private void ParseItem( XmlReader reader, IResource item, Hashtable itemElements )
{
    Trace.WriteLineIf( Settings.Trace, "Parsing new item" );
    string xmlBase = reader.GetAttribute( "xml:base" );
    if ( !string.IsNullOrEmpty( xmlBase ) )
    {
        // Resolve the item's xml:base against the channel's link base when there is one.
        string channelXmlBase = _feed.GetPropText( Props.LinkBase );
        if ( channelXmlBase.Length > 0 )
        {
            try
            {
                xmlBase = new Uri( new Uri( channelXmlBase ), xmlBase ).ToString();
            }
            catch( UriFormatException )
            {
                // ignore: keep the item's own xml:base value as it is
            }
        }
        item.SetProp( Props.LinkBase, xmlBase );
    }

    // An empty element (<item/>) has no children and no end tag; without this
    // check the loop below would run on into the elements that follow it.
    if ( reader.IsEmptyElement )
    {
        return;
    }

    int startDepth = reader.Depth;
    bool skipNextRead = false;
    while ( skipNextRead || reader.Read() )
    {
        // The flag is valid for one iteration only. Clearing it here makes sure
        // the reader advances again when the current node is neither a direct
        // child element nor an end element (text, comment, end of input);
        // otherwise the loop would keep re-examining the same node forever.
        skipNextRead = false;

        if ( reader.NodeType == XmlNodeType.Element && reader.Depth == startDepth + 1 )
        {
            // Direct children of the item are dispatched by "namespace:localname".
            string attrName = reader.NamespaceURI + ":" + reader.LocalName;
            IFeedElementParser parser = (IFeedElementParser)itemElements[ attrName ];
            if ( parser != null )
            {
                if( Settings.Trace )
                {
                    Trace.WriteLine( "Invoking handler " + parser + " for element " + attrName );
                }
                parser.ParseValue( item, reader );
                // The parser reports whether the current node still has to be
                // examined, i.e. whether the next Read() must be skipped.
                skipNextRead = parser.SkipNextRead;
            }
            else if( Settings.Trace )
            {
                Trace.WriteLine( "Handler not found for element " + attrName );
            }
        }
        else if ( reader.NodeType == XmlNodeType.EndElement && reader.Depth == startDepth )
        {
            // End tag of the item element itself.
            return;
        }
    }
}
/**
 * Handles a freshly parsed item: raises ItemParsed, then either updates
 * the matching existing item or adds the item to the feed.
 */
private void AddOrUpdateItemToFeed( IResource item )
{
    Guard.NullArgument( item, "item" );
    if ( Settings.Trace )
    {
        Trace.WriteLine( "Parsed item with subject " + item.GetPropText( Core.Props.Subject ) );
    }

    if ( ItemParsed != null )
    {
        ItemParsed( this, new ResourceEventArgs( item ) );
        // Nothing more to do if the item is deleted after the event was raised.
        if ( item.IsDeleted )
        {
            return;
        }
    }

    IResource existing = GetExistingItem( item );
    if ( existing != null )
    {
        UpdateItemToFeed( item, existing );
        return;
    }
    AddItemToFeed( item );
}
/**
 * Copies the value of the given property from item to oldItem, provided
 * that item has the property.
 */
private static void UpdateProp( IResource item, IResource oldItem, int propId )
{
    if ( !item.HasProp( propId ) )
    {
        return;
    }
    object newValue = item.GetProp( propId );
    oldItem.SetProp( propId, newValue );
}
/**
 * Overload taking a PropId: copies the value of the given property from
 * item to oldItem, provided that item has the property.
 */
private static void UpdateProp(IResource item, IResource oldItem, PropId propId)
{
    if (!item.HasProp(propId))
    {
        return;
    }
    oldItem.SetProp(propId, item.GetProp(propId));
}
/**
 * Copies the data of a freshly parsed item onto the already stored item
 * it was matched with, then clears the parsed item and verifies that both
 * belong to the feed being parsed.
 */
private void UpdateItemToFeed(IResource item, IResource oldItem)
{
    // Validate before entering the try block: the finally block below
    // dereferences both arguments, so with the guards inside the try a null
    // argument would surface as a failure from the finally block instead.
    Guard.NullArgument( item, "item" );
    Guard.NullArgument( oldItem, "oldItem" );
    try
    {
        // An empty subject never overwrites the stored one.
        string subject = item.GetPropText( Core.Props.Subject );
        if ( subject.Length > 0 )
        {
            oldItem.SetProp( Core.Props.Subject, subject );
        }
        UpdateBodyAndSize( item, oldItem );
        UpdateProp( item, oldItem, Props.CommentCount );
        UpdateProp( item, oldItem, Props.WfwComment );
        UpdateProp( item, oldItem, Props.RSSSourceTag );
        UpdateProp( item, oldItem, Props.RSSCategory );
        UpdateProp( item, oldItem, Props.RSSSourceTagUrl );
        // The enclosure state is copied only while the stored item has none.
        if ( !oldItem.HasProp( Props.EnclosureDownloadingState ) && item.HasProp( Props.EnclosureDownloadingState ) )
        {
            UpdateProp( item, oldItem, Props.EnclosureDownloadingState );
        }
    }
    finally
    {
        // Read the ids first: ClearProperties() wipes the marker property.
        int newId = item.GetIntProp( Props._propFake );
        int id = oldItem.GetLinksOfType( Props.RSSFeedResource, Props.RSSItem )[ 0 ].Id;
        item.ClearProperties();
        if( id != newId )
            throw new ApplicationException( "Feed-Post update linkage violation - Feed ids do not coinside.");
        if( _feed.Id != newId )
            throw new ApplicationException( "Feed-Post update linkage violation - Feed id do not coinside with newItem id.");
    }
}
/**
 * Adds a freshly parsed item to the feed: assigns its index, links it to
 * the feed, fills in date, author, links, comment links and categories,
 * marks it unread and queues it for indexing and rule processing.
 * Items whose hash is in the deleted set are cleared and not added.
 */
private void AddItemToFeed( IResource item )
{
int linksCount = 0;
Guard.NullArgument( item, "item" );
// Remember the id so that GetSamePropItem does not treat two items of the
// same parse as duplicates of each other.
_currentFeedItems.Add( item.Id );
if ( IsDeletedItem( item ) )
{
// _nextItem is left untouched on this path, so ParseChannel reuses the
// cleared transient resource for the next item.
item.ClearProperties();
return;
}
try
{
// The feed keeps a running counter; the item gets the next index.
int feedIndex = _feed.GetIntProp( Props.LastItemIndex );
item.SetProp( Props.IndexInFeed, feedIndex + 1 );
_feed.SetProp( Props.LastItemIndex, feedIndex + 1 );
_feed.AddLink( Props.RSSItem, item );
SetItemDate( item );
SetAuthor( item );
ExtractLinksAndReplies( item );
SetCommentLinks( item );
AssignFeedCategories( item );
item.SetProp( Core.Props.IsUnread, true );
item.SetProp( Core.Props.LongBodyIsHTML, true );
item.SetProp( Props.DownloadDate, DateTime.Now );
item.EndUpdate();
linksCount = item.GetLinksTo( Props.RSSFeedResource, Props.RSSItem ).Count;
}
finally
{
// Force ParseChannel to create a new transient resource for the next item.
_nextItem = null;
// NOTE(review): linksCount is still 0 when the try block throws, so the
// check below then throws as well and replaces the original exception.
if( linksCount != 1 )
throw new ApplicationException( "Feed-Post linkage violation: amount of links exceeds 1 = " + linksCount );
// The marker set by ParseChannel must still name the feed being parsed.
if( item.GetIntProp( Props._propFake ) != _feed.Id )
throw new ApplicationException( "Feed-Post linkage violation" );
}
Core.TextIndexManager.QueryIndexing( item.Id );
Core.FilterEngine.ExecRules( StandardEvents.ResourceReceived, item );
}
/**
 * Gives the item a date if it has none. The sources are tried in order:
 * the item's modification date, the feed's publication date, and finally
 * the _parseDate of this parser.
 */
private void SetItemDate( IResource item )
{
    if ( item.HasProp( Core.Props.Date ) )
    {
        return;
    }

    if ( item.HasProp( Props.DateModified ) )
    {
        item.SetProp( Core.Props.Date, item.GetDateProp( Props.DateModified ) );
        return;
    }

    if ( _feed.HasProp( Props.PubDate ) )
    {
        item.SetProp( Core.Props.Date, _feed.GetDateProp( Props.PubDate ) );
        return;
    }

    item.SetProp( Core.Props.Date, _parseDate );
}
/**
 * Makes sure the item has a "from" link and the feed has an author name.
 */
private void SetAuthor( IResource item )
{
    if ( item.HasProp( Core.ContactManager.Props.LinkFrom ) )
    {
        // The item brought its own author contact; use its display name as
        // the feed author unless the feed already has one.
        IResource author = item.GetLinkProp( Core.ContactManager.Props.LinkFrom );
        if ( !_feed.HasProp( Props.Author ) )
        {
            _feed.SetProp( Props.Author, author.DisplayName );
        }
        return;
    }

    // No author on the item: fall back to the first contact linked to the
    // feed, or to the feed resource itself when there is none.
    IResourceList feedContacts = _feed.GetLinksOfType( "Contact", Props.Weblog );
    if ( feedContacts.Count > 0 )
    {
        item.AddLink( Core.ContactManager.Props.LinkFrom, feedContacts[ 0 ] );
    }
    else
    {
        item.AddLink( Core.ContactManager.Props.LinkFrom, _feed );
    }
}
/**
 * Links the item to _commentItem and to _commentFeed, whichever of the
 * two is set.
 */
private void SetCommentLinks( IResource item )
{
    bool hasCommentItem = ( _commentItem != null );
    if ( hasCommentItem )
    {
        item.AddLink( Props.ItemComment, _commentItem );
    }

    bool hasCommentFeed = ( _commentFeed != null );
    if ( hasCommentFeed )
    {
        item.AddLink( Props.FeedComment, _commentFeed );
    }
}
/**
 * Links the item to every category the feed is linked to.
 */
private void AssignFeedCategories( IResource item )
{
    foreach ( IResource feedCategory in _feed.GetLinksOfType( "Category", "Category" ) )
    {
        item.AddLink( "Category", feedCategory );
    }
}
/**
 * Ensures the item has a subject, stores its body and size, and looks for
 * an item of the feed with the same subject and body.
 * Returns the matching item, or null if there is none.
 */
private IResource PrepareSubjectAndBody( IResource item )
{
    if ( item.GetPropText( Core.Props.Subject ).Length == 0 )
    {
        CreateDefaultSubject( item );
    }

    // The summary is used as the body when the item has no long body.
    int bodyPropId = Props.Summary;
    if ( item.HasProp( Core.Props.LongBody ) )
    {
        bodyPropId = Core.Props.LongBody;
    }
    string longBody = item.GetPropText( bodyPropId );
    string subject = item.GetPropText( Core.Props.Subject );

    // Relative links are rare, so the untransformed body is tried first.
    IResource duplicate = FindByHash( item, subject, longBody );
    if ( duplicate != null )
    {
        return duplicate;
    }

    // Base URL for relative links: the item's link base, else the feed URL.
    string baseUrl = item.GetPropText( Props.LinkBase );
    if ( baseUrl.Length == 0 )
    {
        baseUrl = _feed.GetPropText( Props.URL );
    }
    string fixedBody = ( baseUrl.Length > 0 )
        ? HtmlTools.FixRelativeLinks( longBody, baseUrl )
        : longBody;

    if ( fixedBody.Equals( longBody ) )
    {
        // Nothing changed, so a second lookup would give the same result.
        UpdateBody( item, longBody, subject );
        return null;
    }

    UpdateBody( item, fixedBody, subject );
    return FindByHash( item, subject, fixedBody );
}
/**
 * Stores the hash of subject and body on the item and searches the feed
 * for an item with the same hash whose subject and long body are equal
 * to the given ones. Returns that item, or null if there is none.
 */
private IResource FindByHash( IResource item, string subject, string longBody )
{
    int crc = Utils.GetHashCodeInLowerCase( subject, longBody );
    item.SetProp( Props.RssLongBodyCRC, crc );

    IResourceList sameCrc = Core.ResourceStore.FindResources( null, Props.RssLongBodyCRC, crc );
    if ( sameCrc.Count == 0 )
    {
        return null;
    }

    // Only items of this feed are of interest.
    IResourceList inThisFeed = sameCrc.Intersect( _feed.GetLinksFrom( null, Props.RSSItem ), true );
    foreach ( IResource candidate in inThisFeed.ValidResources )
    {
        // Equal hashes are only a hint; compare the actual texts.
        if ( !subject.Equals( candidate.GetPropText( Core.Props.Subject ) ) )
        {
            continue;
        }
        if ( longBody.Equals( candidate.GetPropText( Core.Props.LongBody ) ) )
        {
            return candidate;
        }
    }
    return null;
}
/**
 * Stores the long body on the item and sets its size to the body length,
 * or to the subject length when the body is empty.
 */
private static void UpdateBody( IResource item, string longBody, string subject )
{
    item.SetProp( Core.Props.LongBody, longBody );
    int size = ( longBody.Length != 0 ) ? longBody.Length : subject.Length;
    item.SetProp( Core.Props.Size, size );
}
/**
 * Copies the body (long body, or summary when there is no long body) and
 * the body hash from fromItem to toItem and recalculates the size of toItem.
 */
private static void UpdateBodyAndSize( IResource fromItem, IResource toItem )
{
    int sourcePropId = Props.Summary;
    if ( fromItem.HasProp( Core.Props.LongBody ) )
    {
        sourcePropId = Core.Props.LongBody;
    }
    string body = fromItem.GetPropText( sourcePropId );

    toItem.SetProp( Core.Props.LongBody, body );
    toItem.SetProp( Props.RssLongBodyCRC, fromItem.GetProp( Props.RssLongBodyCRC ) );

    // An empty body falls back to the length of the target's subject.
    int size = ( body.Length != 0 )
        ? body.Length
        : toItem.GetPropText( Core.Props.Subject ).Length;
    toItem.SetProp( Core.Props.Size, size );
}
/**
 * Looks for an item already stored in the feed that the parsed item is a
 * new version of. The GUID is tried first; an item without a GUID is
 * matched by link (when links are unique), else by date, else by subject.
 * Unless _allowEqualPosts is set, an item with equal subject and body is
 * used as the last resort. Returns null if nothing matches.
 */
private IResource GetExistingItem( IResource item )
{
    IResource match = GetSamePropItem( item, Props.GUID, true );
    if ( match != null )
    {
        if ( Settings.Trace )
        {
            Trace.WriteLine( "Found item with same GUID " + item.GetPropText( Props.GUID ) );
        }
    }
    else if ( !item.HasProp( Props.GUID ) )
    {
        if ( GetUniqueLinks() && item.HasProp( Props.Link ) )
        {
            match = GetSamePropItem( item, Props.Link, false );
            if ( match != null && Settings.Trace )
            {
                Trace.WriteLine( "Found item with same link " + item.GetPropText( Props.Link ) );
            }
        }
        else if ( item.HasProp( Core.Props.Date ) )
        {
            match = GetSamePropItem( item, Core.Props.Date, false );
            if ( match != null && Settings.Trace )
            {
                Trace.WriteLine( "Found item with same date " + item.GetPropText( Core.Props.Date ) );
            }
        }
        else
        {
            // the LongBody is not searchable, so the subject is used instead
            match = GetSamePropItem( item, Core.Props.Subject, false );
            if ( match != null && Settings.Trace )
            {
                Trace.WriteLine( "Found item with same subject " + item.GetPropText( Core.Props.Subject ) );
            }
        }
    }

    // Always called, whatever was found above: it also fills in the
    // subject, body, size and body hash of the parsed item.
    IResource sameContent = PrepareSubjectAndBody( item );
    if ( match == null && !_allowEqualPosts )
    {
        match = sameContent;
        if ( sameContent != null && Settings.Trace )
        {
            Trace.WriteLine( "Found item by CRC " + item.GetPropText( Props.RssLongBodyCRC ) );
        }
    }
    return match;
}
/**
 * Finds an item of the feed that has the same value of the given property
 * as the parsed item and was not seen during the current parse.
 * With caseSensitive set, the stored value must also pass an Equals()
 * comparison. Returns null if the item lacks the property or nothing matches.
 */
private IResource GetSamePropItem( IResource item, int propId, bool caseSensitive )
{
    if ( !item.HasProp( propId ) )
    {
        return null;
    }

    object wanted = item.GetProp( propId );
    IResourceList matches = _store.FindResources( "RSSItem", propId, wanted )
        .Intersect( _feed.GetLinksFrom( null, Props.RSSItem ), true );

    foreach ( IResource other in matches )
    {
        bool failsExactCompare = caseSensitive && !other.GetProp( propId ).Equals( wanted );
        if ( failsExactCompare )
        {
            continue;
        }
        // two items which are present in the feed at the same time are never duplicates
        if ( other.Id == item.Id || _currentFeedItems.Contains( other.Id ) )
        {
            continue;
        }
        return other;
    }
    return null;
}
/**
 * Derives a subject from the long body of the item: the first line of the
 * body with HTML stripped and whitespace collapsed, cut at a word boundary
 * and followed by "..." when it is longer than 100 characters.
 * Does nothing when the item has no long body.
 */
private static void CreateDefaultSubject( IResource item )
{
    const int MAX_DESC_LENGTH = 100;
    if ( !item.HasProp( Core.Props.LongBody ) )
    {
        return;
    }

    string subj = HtmlTools.ReplaceLineBreaks( item.GetPropText( Core.Props.LongBody ) );
    subj = HtmlTools.StripHTML( subj ).Trim();

    // Keep the first line only. The char overload performs an ordinal search;
    // the string overload is culture-sensitive and not reliable for control
    // characters on every runtime.
    int lineBreakIndex = subj.IndexOf( '\n' );
    if ( lineBreakIndex != -1 )
    {
        subj = subj.Substring( 0, lineBreakIndex );
    }
    subj = HtmlTools.SafeHtmlDecode( subj );
    subj = NormalizeWhiteSpace( subj );

    if ( subj.Length > MAX_DESC_LENGTH )
    {
        // Walk back from the limit to the nearest space, then over the run
        // of spaces, so that the cut falls at the end of a word.
        int pos = MAX_DESC_LENGTH;
        while ( pos >= 0 && subj[ pos ] != ' ' )
        {
            pos--;
        }
        while ( pos >= 0 && subj[ pos ] == ' ' )
        {
            pos--;
        }
        if ( pos > 0 )
        {
            subj = subj.Substring( 0, pos + 1 ) + "...";
        }
        else
        {
            // No usable word boundary: cut hard at the limit.
            subj = subj.Substring( 0, MAX_DESC_LENGTH );
        }
    }
    item.SetProp( Core.Props.Subject, subj );
}
/**
 * Connects the item with other posts: items whose link list contains this
 * item's link get a LinkedPost link to it, and the links found in the
 * item's body are collected and resolved by AddLinkMatches.
 */
private static void ExtractLinksAndReplies( IResource item )
{
    string ownLink = item.GetStringProp( Props.Link );
    if ( ownLink != null )
    {
        // Posts that have already recorded this item's link in their link list.
        foreach ( IResource reply in Core.ResourceStore.FindResources( "RSSItem", Props.LinkList, ownLink ) )
        {
            if ( item.HasLink( Props.LinkedPost, reply ) )
            {
                continue;
            }
            reply.AddLink( Props.LinkedPost, item );
        }
    }

    // NOTE(review): this list is not disposed here, unlike the string list
    // in SaveDeletedItemsSet - confirm whether that is intentional.
    IStringList bodyLinks = item.GetStringListProp( Props.LinkList );
    AddLinkMatches( bodyLinks, item, _rxLink );
    AddLinkMatches( bodyLinks, item, _rxLink2 );
}
/**
 * Runs the regular expression over the long body of the item. The first
 * group of every match is added to linkList, and the item gets a
 * LinkedPost link to every RSS item whose Link equals that value.
 */
private static void AddLinkMatches( IStringList linkList, IResource item, Regex rxLink )
{
    string html = item.GetPropText( Core.Props.LongBody );
    if ( html.Length > 0 )
    {
        foreach ( Match match in rxLink.Matches( html ) )
        {
            string url = match.Groups[ 1 ].Value.Trim();
            linkList.Add( url );

            foreach ( IResource target in Core.ResourceStore.FindResources( "RSSItem", Props.Link, url ) )
            {
                // Skip targets that already have a LinkedPost link to this item.
                if ( target.HasLink( Props.LinkedPost, item ) )
                {
                    continue;
                }
                item.AddLink( Props.LinkedPost, target );
            }
        }
    }
}
/**
 * Checks whether the hash of the item is in the set of deleted items.
 * A hash that is found is also recorded in _newDeletedItems, so that
 * SaveDeletedItemsSet keeps it.
 */
private bool IsDeletedItem( IResource item )
{
    string hash = GetRSSItemMD5( item );
    bool wasDeleted = _deletedItems.Contains( hash );
    if ( wasDeleted )
    {
        _newDeletedItems.Add( hash );
    }
    return wasDeleted;
}
/**
 * Returns the Base64-encoded MD5 hash of the UTF-8 bytes of the item's
 * subject followed by its long body.
 */
public static string GetRSSItemMD5( IResource item )
{
    Debug.Assert( item.Type == "RSSItem" );
    string body = item.GetPropText( Core.Props.Subject ) + item.GetPropText( Core.Props.LongBody );
    // The hash algorithm object is disposable; release it as soon as the
    // hash has been computed.
    using ( MD5 md5 = MD5.Create() )
    {
        byte[] md5hash = md5.ComputeHash( Encoding.UTF8.GetBytes( body ) );
        return Convert.ToBase64String( md5hash );
    }
}
/**
 * Replaces every line feed, carriage return and tab with a space and
 * collapses each run of spaces into a single space.
 */
private static string NormalizeWhiteSpace( string s )
{
    StringBuilder buffer = StringBuilderPool.Alloc();
    try
    {
        bool previousWasSpace = false;
        foreach ( char raw in s )
        {
            char ch = ( raw == '\n' || raw == '\r' || raw == '\t' ) ? ' ' : raw;
            if ( ch == ' ' && previousWasSpace )
            {
                // Second or later space of a run: drop it.
                continue;
            }
            previousWasSpace = ( ch == ' ' );
            buffer.Append( ch );
        }
        return buffer.ToString();
    }
    finally
    {
        // Hand the builder back to the pool whatever happens.
        StringBuilderPool.Dispose( buffer );
    }
}
}
}