///
/// Copyright © 2003-2008 JetBrains s.r.o.
/// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder).
///
using System;
using System.Collections;
using System.IO;
using System.Text;
using JetBrains.Omea.OpenAPI;
using JetBrains.Omea.Containers;
using JetBrains.Omea.HTML;
using JetBrains.Omea.HttpTools;
using JetBrains.DataStructures;
namespace JetBrains.Omea.RSSPlugin
{
/// <summary>
/// Discovers the RSS and Atom feeds referenced by the specified Web page.
/// </summary>
internal class RSSDiscover: AbstractJob
{
// URI of the page being scanned; hrefs found in the page are resolved against it.
private Uri _baseUri;
// Feeds that were downloaded and parsed successfully during the current run.
private RSSDiscoverResults _results;
// Candidate feed URLs, queued together with the priority they should be tried at.
private PriorityQueue _candidateURLs;
// URLs that have already been queued; <a> tags whose URL is in this set are not queued again.
private HashSet _candidateURLSet;
// Priority of the candidate most recently popped from the queue; -1 until the first one is popped.
private int _lastPriority;
// RSSFeed resource (marked Transient) created for the candidate currently being tried.
private IResource _lastFeed;
// Unit of work queued via Core.NetworkAP for _lastFeed.
private RSSUnitOfWork _lastUnitOfWork;
// Maps a candidate URL to its hint text: the <link> title attribute, or the trimmed
// text fragment read while an <a> candidate was pending.
private HashMap _candidateHintTexts;
// URL queued by the most recent <a> tag that is still waiting for its hint text; null otherwise.
private string _lastCandidateURL;
// When false, StartDiscover only collects candidates and does not start trying them.
private bool _downloadResults = true;
// Raised with a status message while discovery is in progress.
public event DownloadProgressEventHandler DiscoverProgress;
// Raised when no further candidates will be tried.
public event EventHandler DiscoverDone;
/// <summary>
/// Gets the feeds confirmed so far by the current discovery run
/// (null until <c>StartDiscover</c> has been called).
/// </summary>
public RSSDiscoverResults Results
{
    get
    {
        return _results;
    }
}
/// <summary>
/// Scans an HTML page for feed candidates and, unless result downloading
/// has been switched off, starts trying the candidates one by one.
/// </summary>
/// <param name="url">URL of the page; hrefs found in the page are resolved against it.</param>
/// <param name="readStream">Stream with the page content; it is rewound with Seek before reading.</param>
/// <param name="charset">Charset name of the page, or null to detect it from the content.</param>
public void StartDiscover( string url, Stream readStream, string charset )
{
    OnDiscoverProgress( "Discovering..." );

    // Fresh state for this run.
    _baseUri = new Uri( url );
    _results = new RSSDiscoverResults();
    _candidateURLs = new PriorityQueue();
    _candidateHintTexts = new HashMap();
    _candidateURLSet = new HashSet();

    TextReader htmlReader = OpenHTMLReader( readStream, charset );
    using( HTMLParser htmlParser = new HTMLParser( htmlReader ) )
    {
        htmlParser.AddTagHandler( "link", new HTMLParser.TagHandler( OnLinkTag ) );
        htmlParser.AddTagHandler( "a", new HTMLParser.TagHandler( OnATag ) );
        htmlParser.AddTagHandler( "/a", new HTMLParser.TagHandler( OnEndATag ) );

        while( !htmlParser.Finished )
        {
            string text = htmlParser.ReadNextFragment();
            if ( _lastCandidateURL == null )
            {
                continue;
            }

            // An <a> handler has just queued a candidate: the fragment read
            // while it is pending becomes that candidate's hint text.
            _candidateHintTexts [_lastCandidateURL] = text.Trim();
            _lastCandidateURL = null;
        }

        // -1 marks "no candidate popped yet" for ParseNextCandidate.
        _lastPriority = -1;
        if ( _downloadResults )
        {
            ParseNextCandidate();
        }
    }
}
/// <summary>
/// Sets whether <c>StartDiscover</c> goes on to try the collected candidates
/// (true, the default) or only collects them (false).
/// </summary>
internal bool DownloadResults
{
    set
    {
        _downloadResults = value;
    }
}
/// <summary>
/// Gets the queue of candidate feed URLs collected by <c>StartDiscover</c>.
/// </summary>
internal PriorityQueue CandidateURLs
{
    get
    {
        return _candidateURLs;
    }
}
/// <summary>
/// Intentionally empty override; the discovery work in this class is started
/// through <c>StartDiscover</c>.
/// </summary>
protected override void Execute()
{
}
/// <summary>
/// Handles a &lt;link&gt; tag: when the tag advertises an RSS or Atom alternate,
/// queues its href as a candidate with the highest priority (10) and records
/// the tag's title attribute as the hint text.
/// </summary>
/// <param name="instance">Parser that encountered the tag.</param>
/// <param name="tag">Tag text from which the attributes are parsed.</param>
private void OnLinkTag( HTMLParser instance, string tag )
{
    HashMap attrMap = instance.ParseAttributes( tag );

    // rel keywords and MIME type names are case-insensitive in HTML, so
    // rel="Alternate" or type="application/RSS+XML" must match as well.
    // The invariant culture keeps the comparison independent of the user's locale.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    string rel = (string) attrMap ["rel"];
    if ( String.Compare( rel, "alternate", true, invariant ) != 0 )
    {
        return;
    }

    string type = (string) attrMap ["type"];
    if ( String.Compare( type, "application/rss+xml", true, invariant ) != 0 &&
         String.Compare( type, "application/atom+xml", true, invariant ) != 0 )
    {
        return;
    }

    string href = (string) attrMap ["href"];
    if ( href == null )
    {
        return;
    }

    string url;
    try
    {
        url = new Uri( _baseUri, href ).ToString();
    }
    catch( UriFormatException )
    {
        // malformed href: ignore this tag
        return;
    }

    if ( !HttpReader.IsSupportedProtocol( url ) )
    {
        return;
    }

    _candidateURLs.Push( 10, url );
    // Remember the URL so that a later <a> tag pointing to it is not queued again.
    _candidateURLSet.Add( url );
    _candidateHintTexts [url] = attrMap ["title"];
}
/// <summary>
/// Handles an &lt;a&gt; tag: queues its href as a feed candidate when the link
/// looks like a feed (by extension or by "rss"/"rdf"/"xml" in the href).
/// Links to the same host as the page get a higher priority.
/// </summary>
/// <param name="instance">Parser that encountered the tag.</param>
/// <param name="tag">Tag text from which the attributes are parsed.</param>
private void OnATag( HTMLParser instance, string tag )
{
    if ( instance.InScript )
        return;

    HashMap attrMap = instance.ParseAttributes( tag );
    string href = (string) attrMap ["href"];
    if ( href == null )
        return;

    href = NormalizeFeedScheme( href );

    Uri hrefUri;
    try
    {
        hrefUri = new Uri( _baseUri, href );
    }
    catch( Exception )
    {
        // sometimes generic exceptions are thrown from Uri constructor (see OM-9323)
        return;
    }

    string hrefUriString;
    try
    {
        // OM-12523: Uri.ToString() itself can throw
        // "System.UriFormatException: Invalid URI: The hostname could not be parsed."
        // (raised from Uri.CreateHostString via Uri.GetComponentsHelper).
        hrefUriString = hrefUri.ToString();
    }
    catch( System.UriFormatException )
    {
        return;
    }

    if ( !HttpReader.IsSupportedProtocol( hrefUriString ) )
    {
        return;
    }

    bool sameServer = ( String.Compare( _baseUri.Host, hrefUri.Host, true) == 0 );

    // Everything from the last dot of the href onwards is treated as its extension.
    int pos = href.LastIndexOf( "." );
    string ext = ( pos < 0 ) ? "" : href.Substring( pos ).ToLower();

    int priority = 0;
    if ( ext == ".rss" || ext == ".rdf" || ext == ".xml" )
    {
        priority = sameServer ? 9 : 7;
    }
    else
    {
        href = href.ToLower();
        if ( href.IndexOf( "rss" ) >= 0 || href.IndexOf( "rdf" ) >= 0 || href.IndexOf( "xml" ) >= 0 )
        {
            priority = sameServer ? 8 : 6;
        }
    }

    // priority 0 means the link does not look like a feed at all
    if ( priority == 0 )
        return;

    if ( _candidateURLSet.Contains( hrefUriString ) )
        return;

    // Leave the URL pending so that the next text fragment becomes its hint text.
    _lastCandidateURL = hrefUriString;
    _candidateURLSet.Add( _lastCandidateURL );
    _candidateURLs.Push( priority, _lastCandidateURL );
}

/// <summary>
/// Rewrites a "feed:" href to a plain URL. Both common forms are handled:
/// "feed://host/path" becomes "http://host/path", and "feed:http://host/path"
/// (or https) simply loses the "feed:" prefix. Other hrefs are returned unchanged.
/// </summary>
private static string NormalizeFeedScheme( string href )
{
    const string feedPrefix = "feed:";
    if ( !HasPrefixIgnoreCase( href, feedPrefix ) )
        return href;

    string rest = href.Substring( feedPrefix.Length );
    if ( HasPrefixIgnoreCase( rest, "http:" ) || HasPrefixIgnoreCase( rest, "https:" ) )
    {
        // "feed:http://..." already carries the real scheme; prepending "http:"
        // would produce the invalid "http:http://..."
        return rest;
    }
    return "http:" + rest;
}

/// <summary>
/// Returns true when <paramref name="value"/> starts with <paramref name="prefix"/>,
/// ignoring case (URI scheme names are case-insensitive).
/// </summary>
private static bool HasPrefixIgnoreCase( string value, string prefix )
{
    return value.Length >= prefix.Length &&
        String.Compare( value, 0, prefix, 0, prefix.Length, true,
            System.Globalization.CultureInfo.InvariantCulture ) == 0;
}
/// <summary>
/// Handles a closing &lt;/a&gt; tag by dropping the pending candidate, so that
/// text appearing after the closing tag is not used as the feed's hint text.
/// </summary>
/// <param name="instance">Parser that encountered the tag (unused).</param>
/// <param name="tag">Tag text (unused).</param>
private void OnEndATag( HTMLParser instance, string tag )
{
    _lastCandidateURL = null;
}
/// <summary>
/// Creates a text reader over the page stream, positioned at the start of the stream.
/// </summary>
/// <param name="readStream">Stream with the page content; it is repositioned with Seek.</param>
/// <param name="charset">Charset name, or null to detect it from the content.</param>
/// <returns>A reader that decodes the stream with the resolved encoding.</returns>
private TextReader OpenHTMLReader( Stream readStream, string charset )
{
    string charsetName = charset;
    if ( charsetName == null )
    {
        // No charset supplied: detect it from the content.
        // The detection reader is deliberately left open, because closing it
        // would also close the underlying stream that is still needed below.
        readStream.Seek( 0, SeekOrigin.Begin );
        StreamReader detectionReader = new StreamReader( readStream );
        charsetName = HtmlTools.DetectCharset( detectionReader );
    }

    // Rewind so that the returned reader starts at the beginning of the page.
    readStream.Seek( 0, SeekOrigin.Begin );
    return new StreamReader( readStream, ResolveEncoding( charsetName ) );
}

/// <summary>
/// Maps a charset name to an encoding, falling back to the system default
/// encoding when the name cannot be resolved.
/// </summary>
private static Encoding ResolveEncoding( string charsetName )
{
    try
    {
        return Encoding.GetEncoding( charsetName );
    }
    catch( Exception )
    {
        return Encoding.Default;
    }
}
/// <summary>
/// Pops the next candidate URL and queues a download/parse job for it.
/// Discovery finishes when the queue is empty, or when at least one feed has
/// already been found and the queue moves on to a different priority below 9.
/// </summary>
private void ParseNextCandidate()
{
    PriorityQueue.QueueEntry candidate = _candidateURLs.PopEntry();

    bool finished = ( candidate == null );
    if ( !finished )
    {
        // Stop descending into weaker guesses once something has been found.
        finished = _lastPriority != -1
            && candidate.Priority != _lastPriority
            && candidate.Priority < 9
            && _results.Count > 0;
    }
    if ( finished )
    {
        OnDiscoverDone();
        return;
    }

    _lastPriority = candidate.Priority;

    // Create a transient feed resource that the unit of work downloads into;
    // RSSParseDone deletes it again afterwards.
    ResourceProxy feedProxy = ResourceProxy.BeginNewResource( "RSSFeed" );
    feedProxy.SetProp( "Transient", 1 );
    feedProxy.SetProp( "URL", (string) candidate.Value );
    feedProxy.EndUpdate();
    _lastFeed = feedProxy.Resource;

    _lastUnitOfWork = new RSSUnitOfWork( _lastFeed, false, true );
    RSSUnitOfWork unitOfWork = _lastUnitOfWork;
    unitOfWork.DownloadProgress += new DownloadProgressEventHandler( RSSDownloadProgress );
    unitOfWork.ParseDone += new EventHandler( RSSParseDone );
    Core.NetworkAP.QueueJob( unitOfWork );
}
/// <summary>
/// Relays download progress of the candidate being tried as a discovery progress message.
/// </summary>
/// <param name="sender">The <c>RSSUnitOfWork</c> reporting progress.</param>
/// <param name="e">Progress data; its size summary is included in the message.</param>
private void RSSDownloadProgress( object sender, DownloadProgressEventArgs e )
{
    RSSUnitOfWork unitOfWork = (RSSUnitOfWork) sender;
    string message = "Trying " + unitOfWork.FeedURL + " (" + e.SizesToString() + ")...";
    OnDiscoverProgress( message );
}
/// <summary>
/// Called when the unit of work for the current candidate has finished.
/// Records the candidate as a result when it succeeded, deletes the
/// transient feed resource and moves on to the next candidate.
/// </summary>
/// <param name="sender">Event source (unused; the current unit of work is read from the field).</param>
/// <param name="e">Event data (unused).</param>
private void RSSParseDone( object sender, EventArgs e )
{
    bool succeeded = ( _lastUnitOfWork.Status == RSSWorkStatus.Success );
    if ( succeeded )
    {
        string feedName = _lastFeed.GetStringProp( Core.Props.Name );
        string feedUrl = _lastFeed.GetStringProp( Props.URL );
        string hint = (string) _candidateHintTexts [feedUrl];

        // A feed without a name is listed under its URL.
        string displayName = ( feedName != null ) ? feedName : feedUrl;
        _results.Add( new RSSDiscoverResult( feedUrl, displayName, hint ) );
    }

    // The feed resource only existed for this trial.
    ResourceProxy transientFeed = new ResourceProxy( _lastFeed );
    transientFeed.DeleteAsync();

    ParseNextCandidate();
}
/*
public void SearchSyndic8()
{
OnDiscoverProgress( "Searching Syndic8..." );
_syndic8Proxy = new Syndic8();
_syndic8Proxy.KeepAlive = false;
InvokeAfterWait( new MethodInvoker( Syndic8FindFeeds ), null );
Core.NetworkAP.QueueJob( JobPriority.Immediate, this );
}
private void Syndic8FindFeeds()
{
OnDiscoverProgress( "Finding feeds on Syndic8..." );
_syndic8QueryString = _baseUri.Host.ToLower();
if ( _syndic8QueryString.StartsWith( "www." ) )
{
_syndic8QueryString = _syndic8QueryString.Substring( 4 );
}
_syndic8AR = _syndic8Proxy.BeginFindSites( _syndic8QueryString, null, null );
if ( _syndic8AR.IsCompleted )
{
Syndic8GetFeedInfo();
return;
}
InvokeAfterWait( new MethodInvoker( Syndic8GetFeedInfo ), _syndic8AR.AsyncWaitHandle );
}
private void Syndic8GetFeedInfo()
{
OnDiscoverProgress( "Getting feed info on Syndic8..." );
int [] feedIDs;
try
{
feedIDs = _syndic8Proxy.EndFindSites( _syndic8AR );
}
catch( Exception e )
{
Trace.WriteLine( "Syndic8.com EndFindSites exception: " + e.ToString() );
// ignore Syndic8.com XML-RPC call errors
feedIDs = new int [0];
}
if ( feedIDs.Length == 0 )
{
OnDiscoverDone();
return;
}
_syndic8AR = _syndic8Proxy.BeginGetFeedInfo( feedIDs, null, null );
if ( _syndic8AR.IsCompleted )
{
Syndic8ProcessFeedInfo();
return;
}
InvokeAfterWait( new MethodInvoker( Syndic8ProcessFeedInfo ), _syndic8AR.AsyncWaitHandle );
}
private void Syndic8ProcessFeedInfo()
{
object[] feedInfos;
try
{
feedInfos = _syndic8Proxy.EndGetFeedInfo( _syndic8AR );
}
catch( Exception e )
{
Trace.WriteLine( "Syndic8.com EndGetFeedInfo exception: " + e.ToString() );
OnDiscoverDone();
return;
}
foreach( XmlRpcStruct feedInfo in feedInfos )
{
string status = (string) feedInfo ["status"];
if ( status != "Dead" && status != "Duplicate" && status != "Rejected" )
{
string url = (string) feedInfo ["dataurl"];
// when searching, for example, for "cnews.com", Syndic8 returns
// results like "abcnews.com"
// check that the symbol before the found string is not an
// alphanumeric character
int pos = url.ToLower().IndexOf( _syndic8QueryString.ToLower() );
if ( pos > 0 && Char.IsLetterOrDigit( url, pos-1 ) )
{
continue;
}
int priority;
if ( (string) feedInfo ["scraped"] == "0" )
priority = 5;
else
priority = 4;
if ( HttpReader.IsSupportedProtocol( url ) )
{
_candidateURLs.Push( priority, url );
}
}
}
ParseNextCandidate();
}
*/
/// <summary>
/// Raises <c>DiscoverProgress</c> with the given status message, if anyone is subscribed.
/// </summary>
/// <param name="message">Status text to report.</param>
private void OnDiscoverProgress( string message )
{
    // Copy the delegate first: if the last subscriber detaches between the
    // null check and the invocation, invoking the field directly would throw
    // a NullReferenceException.
    DownloadProgressEventHandler handler = DiscoverProgress;
    if ( handler != null )
    {
        handler( this, new DownloadProgressEventArgs( message ) );
    }
}
/// <summary>
/// Raises <c>DiscoverDone</c>, if anyone is subscribed.
/// </summary>
private void OnDiscoverDone()
{
    // Copy the delegate first: if the last subscriber detaches between the
    // null check and the invocation, invoking the field directly would throw
    // a NullReferenceException.
    EventHandler handler = DiscoverDone;
    if ( handler != null )
    {
        handler( this, EventArgs.Empty );
    }
}
/// <summary>
/// Read-only (for outside callers) list of the feeds found by a discovery run.
/// </summary>
public class RSSDiscoverResults: IEnumerable
{
    private ArrayList _items = new ArrayList();

    /// <summary>Gets the number of feeds found.</summary>
    public int Count
    {
        get
        {
            return _items.Count;
        }
    }

    /// <summary>Gets the result at the given zero-based position.</summary>
    public RSSDiscoverResult this[ int index ]
    {
        get
        {
            object item = _items [index];
            return (RSSDiscoverResult) item;
        }
    }

    /// <summary>Appends a found feed to the list.</summary>
    internal void Add( RSSDiscoverResult result )
    {
        _items.Add( result );
    }

    /// <summary>Enumerates the results in the order they were added.</summary>
    public IEnumerator GetEnumerator()
    {
        return _items.GetEnumerator();
    }
}
/// <summary>
/// A single feed found by discovery: its URL, its name, an optional hint text
/// and whatever <c>RSSPlugin.GetExistingFeed</c> returned for the URL.
/// </summary>
public class RSSDiscoverResult
{
    private string _url;
    private string _name;
    private string _hintText;
    private IResource _existingFeed;

    internal RSSDiscoverResult( string url, string name, string hintText )
    {
        _url = url;
        _name = name;
        _hintText = hintText;
        // Looked up once, at construction time.
        _existingFeed = RSSPlugin.GetExistingFeed( url );
    }

    /// <summary>Gets the feed URL.</summary>
    public string URL
    {
        get { return _url; }
    }

    /// <summary>Gets the feed name.</summary>
    public string Name
    {
        get { return _name; }
    }

    /// <summary>Gets the hint text collected from the page for this feed; may be null.</summary>
    public string HintText
    {
        get { return _hintText; }
    }

    /// <summary>Gets the value <c>RSSPlugin.GetExistingFeed</c> returned for the URL.</summary>
    public IResource ExistingFeed
    {
        get { return _existingFeed; }
    }

    /// <summary>
    /// Returns the URL, followed by the hint text in parentheses when there is one.
    /// </summary>
    public override string ToString()
    {
        if ( _hintText == null || _hintText.Length == 0 )
        {
            return _url;
        }
        return _url + " (" + _hintText + ")";
    }
}
}
}