/// /// Copyright © 2003-2008 JetBrains s.r.o. /// You may distribute under the terms of the GNU General Public License, as published by the Free Software Foundation, version 2 (see License.txt in the repository root folder). /// using System; using System.Collections; using System.Diagnostics; using System.Text; using System.Text.RegularExpressions; using JetBrains.Omea.Base; namespace JetBrains.Omea.MailParser { /// /// Type of the parsed paragraph. /// public enum ParagraphType { /// /// Ordinary paragraph. /// Plain, /// /// Preformatted text, like in the <pre /> HTML tag. /// Fixed, /// /// Contains signature lines. /// Sig, /// /// Something quite special, for example, Outlook information. /// Service } /// /// Type of the plain-text paragraph. /// public enum PlainTextParaType { /// /// Normal paragraph, consists of several lines glued up into one paragraph. /// Plain, /// /// Preformatted text, like in the <pre /> HTML tag. /// Happens when the lines are too short to be considered as wrapped. /// Fixed, /// /// The lines are long enough to seem to be paragraphs not split into lines. Each line should be treated as a paragraph. /// Unwrapped } /// /// Parses the mail body into a list of paragraphs of different types (text, quote, sig). /// public class MailBodyParser { public class Paragraph { private readonly string _text; private readonly ParagraphType _type; private readonly int _quoteLevel; private readonly string _quotePrefix; private readonly bool _outlookQuote; /// /// Initializes the instance. /// internal Paragraph( string text, ParagraphType type, int quoteLevel, string quotePrefix, bool outlookQuote ) { _text = text; _type = type; _quoteLevel = quoteLevel; _quotePrefix = quotePrefix; _outlookQuote = outlookQuote; } public string Text { get { return _text; } } public ParagraphType Type { get { return _type; } } public int QuoteLevel { get { return _quoteLevel; } } public string QuotePrefix { get { return _quotePrefix; } } public bool OutlookQuote { get { return _outlookQuote; } } } private bool _foundOutlookQuote; private int _lastQuoteLevel = 0; private string _lastQuotePrefix = ""; private readonly int _minWrapWidth; private readonly ArrayList _paragraphs = new ArrayList(); private readonly MailBodyParser _origText; public MailBodyParser( string body, int minWrapWidth ) : this( body, minWrapWidth, null ) { } public MailBodyParser( string body, int minWrapWidth, MailBodyParser origText ) { _minWrapWidth = minWrapWidth; _origText = origText; if ( body != null ) { long startTicks = DateTime.Now.Ticks; ParseMailBody( body ); long endTicks = DateTime.Now.Ticks; Debug.WriteLine( "MailBodyParser parsing took " + (endTicks - startTicks) / 10000 + " ms" ); } } public int ParagraphCount { get { return _paragraphs.Count; } } public Paragraph GetParagraph( int index ) { return (Paragraph) _paragraphs[ index ]; } internal Paragraph FindParagraph( string part1, string part2 ) { string rxText = Regex.Escape( part1 ) + "\\s+" + Regex.Escape( part2 ); Regex rx = new Regex( rxText ); foreach( Paragraph para in _paragraphs ) { if ( rx.IsMatch( para.Text ) ) return para; } return null; } /** * Parses the body of the message and fills the paragraphs list. */ private void ParseMailBody( string body ) { body = body.Replace( "\r\n", "\n" ); string[] lines = body.Split( '\n' ); ArrayList curParaLines = new ArrayList(); int prevStartSpaces = -1; bool foundSig = false; bool textAfterSig = false; bool emptyLineAfterSig = false; _foundOutlookQuote = false; bool prevFirstLine = false; foreach( string line in lines ) { if ( foundSig ) { if ( line.Trim() == "" ) { if ( textAfterSig ) { emptyLineAfterSig = true; } AddPara( line, ParagraphType.Sig ); } else { if ( emptyLineAfterSig ) { foundSig = false; } else { textAfterSig = true; AddPara( line, ParagraphType.Sig ); } } } if ( !foundSig ) { if ( line.StartsWith( "-- " ) ) { AddTextPara( curParaLines, true ); curParaLines.Clear(); AddPara( line, ParagraphType.Sig ); foundSig = true; textAfterSig = false; emptyLineAfterSig = false; continue; } else if ( IsOutlookQuoteStart( line ) ) { AddTextPara( curParaLines, true ); curParaLines.Clear(); AddPara( line, ParagraphType.Service ); // the text after ----- Original message ----- is formatted as a quote // only if there was some significant text before it if( HaveNonquotedTextParagraphs() ) { _foundOutlookQuote = true; } continue; } int quoteLevel = GetQuoteLevel( line ); string strippedLine = ( quoteLevel > 0 ) ? StripQuoting( line ) : line; if ( strippedLine.Trim() == "" ) { AddTextPara( curParaLines, true ); curParaLines.Clear(); prevStartSpaces = -1; continue; } // check for broken quoting: // if the original message contains the last line of the last // (quoted) paragraph in the same paragraph as the current (unquoted) // line, it means that the current line is broken quoting and needs // to be appended to the last paragraph. if ( quoteLevel == 0 && _lastQuoteLevel == 1 && _origText != null && curParaLines.Count > 0 ) { string lastLine = (string) curParaLines [curParaLines.Count-1]; if ( _origText.FindParagraph( lastLine.Trim(), strippedLine.Trim() ) != null ) { string fixedLine = lastLine + " " + strippedLine.Trim(); curParaLines [curParaLines.Count-1] = fixedLine; continue; } } string quotePrefix = (quoteLevel > 0) ? GetQuotePrefix( line ) : ""; if ( quoteLevel != _lastQuoteLevel || quotePrefix != _lastQuotePrefix ) { AddTextPara( curParaLines, false ); curParaLines.Clear(); _lastQuoteLevel = quoteLevel; _lastQuotePrefix = quotePrefix; } int startSpaces = CountStartingSpaces( line ); // The condition below this line implements the following logic: // - any time the indent changes, we create a fixed paragraph, // - except for the case when the first line of a paragraph is // indented and the following lines are not if ( quoteLevel == 0 && ((startSpaces > 0 && prevStartSpaces >= 0 ) || (prevStartSpaces > 0 && !prevFirstLine) )) { AddFixedParas( curParaLines, prevFirstLine ); AddPara( line, ParagraphType.Fixed ); curParaLines.Clear(); } else { curParaLines.Add( strippedLine ); } prevFirstLine = ( prevStartSpaces == -1 ); prevStartSpaces = startSpaces; } } if ( curParaLines.Count > 0 ) AddTextPara( curParaLines, true ); } /** * Adds the paragraph with the specified type and text to the paragraph list. */ private void AddPara( string body, ParagraphType type ) { _paragraphs.Add( new Paragraph( body, type, 0, "", false ) ); } /** * Adds a pending paragraph (possibly quoted) to the paragraph list. */ private void AddTextPara( ArrayList lines, bool beforeEmptyLine ) { if ( lines.Count > 0 ) { PlainTextParaType paraType = IsPlainTextPara( lines ); if ( paraType == PlainTextParaType.Plain ) { StringBuilder bodyBuilder = StringBuilderPool.Alloc(); try { for( int i=0; i 0 && beforeEmptyLine ) { Paragraph oldPara = (Paragraph) _paragraphs [_paragraphs.Count-1]; if ( oldPara.Type == ParagraphType.Fixed ) { // insert a break paragraph after a sequence of fixed paragraphs _paragraphs.Add( new Paragraph( "", ParagraphType.Fixed, _lastQuoteLevel, _lastQuotePrefix, _foundOutlookQuote ) ); } } foreach( string line in lines ) { _paragraphs.Add( new Paragraph( line, ParagraphType.Fixed, _lastQuoteLevel, _lastQuotePrefix, _foundOutlookQuote ) ); } } /** * Determines whether the specified array of lines is a block of plain text * (which should be displayed with no line breaks) or of formatted text (which * should be displayed with line breaks. */ private PlainTextParaType IsPlainTextPara( ArrayList lines ) { if ( lines.Count <= 1 ) return PlainTextParaType.Plain; // If the same lines are present in a text to which we are replying, // and were plain text in the original message, they're still plain text now if ( _origText != null ) { Paragraph para = _origText.FindParagraph( (string) lines [0], (string) lines [1] ); if ( para != null && para.Type == ParagraphType.Plain ) return PlainTextParaType.Plain; } int minLineLength = Int32.MaxValue; int maxLineLength = 0; bool linesEndWithSpace = true; // we don't take the last line into account for( int i = 0; i < lines.Count-1; i++ ) { string line = (string) lines [i]; Debug.Assert( line.Length > 0 ); if ( line.Length < minLineLength ) minLineLength = line.Length; if ( line.Length > maxLineLength ) maxLineLength = line.Length; if ( line.Length > 0 && line [line.Length-1] != ' ' ) { linesEndWithSpace = false; } } if ( linesEndWithSpace ) return PlainTextParaType.Plain; // If all lines are smaller that some minimum value, show as separate lines if ( maxLineLength < _minWrapWidth ) return PlainTextParaType.Fixed; // If the lines are all long, this is probably a list of plain-text // paragraphs with no wrapping and no separator lines that needs to be broken // into separate line-long paragraphs int maxLineLength2 = Math.Max( maxLineLength, ((string) lines [lines.Count-1]).Length ); if ( maxLineLength2 > _minWrapWidth*2 ) { // maybe it's a table? bool hasSpaces = false; foreach( string line in lines ) { if ( line.IndexOf( " ") >= 0 || line.IndexOf( "\t" ) >= 0 ) { hasSpaces = true; break; } } if ( !hasSpaces ) return PlainTextParaType.Unwrapped; } /* * Try to autodetect if the text was word-wrapped. If wrapping was used, * then there is a certain margin, and the words are wrapped to the next line * because they exceed that margin. Thus, we add the first word of the next line * to the current line and see if these lengths (unwrapped lengths) for all lines * are greater than the actual wrapped length. * I know that the explanation is a bit unclear... */ int minWrappedLineLength = Int32.MaxValue; for ( int i = 0; i < lines.Count-1; i++ ) { string line = (string) lines [i]; string nextLine = (string) lines [i+1]; int wrappedLineLength = line.Length + 1 /* space */ + FirstWordLength( nextLine ); if ( wrappedLineLength < minWrappedLineLength ) minWrappedLineLength = wrappedLineLength; } if ( minWrappedLineLength > maxLineLength ) return PlainTextParaType.Plain; return PlainTextParaType.Fixed; } /** * Counts the starting spaces in the specified line. */ private static int CountStartingSpaces( string line ) { int cnt = 0; while( cnt < line.Length && Char.IsWhiteSpace( line, cnt ) ) { cnt++; } return cnt; } private static void ParseQuoting( string line, out int quoteLevel, out string quotePrefix, out string quotedText ) { int spaces = CountStartingSpaces( line ); quoteLevel = 0; StringBuilder quotePrefixBuilder = StringBuilderPool.Alloc(); try { int pos = spaces; bool foundWhitespace = false; while( pos < line.Length ) { if ( line [pos] == '>' ) { quoteLevel++; } else if ( Char.IsLetter ( line, pos ) ) { // the letters before the first > character are the quote prefix // any other letter stops the quoting if ( quoteLevel > 0 || foundWhitespace ) break; quotePrefixBuilder.Append( line [pos] ); } else if ( !Char.IsWhiteSpace( line, pos ) ) break; else foundWhitespace = true; pos++; } if ( quoteLevel > 0 ) { quotePrefix = quotePrefixBuilder.ToString(); quotedText = line.Substring( pos ); } else { quotePrefix = ""; quotedText = line; } } finally { StringBuilderPool.Dispose( quotePrefixBuilder ); } } /** * Returns the quoting level (count of > characters) for the specified line. */ public static int GetQuoteLevel( string line ) { int quoteLevel; string quotePrefix, quotedText; ParseQuoting( line, out quoteLevel, out quotePrefix, out quotedText ); return quoteLevel; } /** * Strips the quote prefix from the specified line. */ public static string StripQuoting( string line ) { int quoteLevel; string quotePrefix, quotedText; ParseQuoting( line, out quoteLevel, out quotePrefix, out quotedText ); return quotedText; } /** * Returns the quote prefix (the characters before the > character) for the specified line. */ public static string GetQuotePrefix( string line ) { int quoteLevel; string quotePrefix, quotedText; ParseQuoting( line, out quoteLevel, out quotePrefix, out quotedText ); return quotePrefix; } /** * Returns the length of the first word in a line. */ private static int FirstWordLength( string line ) { int startPos = 0; while( startPos < line.Length && Char.IsWhiteSpace( line, startPos ) ) startPos++; if ( startPos == line.Length ) return 0; int endPos = startPos; while( endPos < line.Length && !Char.IsWhiteSpace( line, endPos ) ) endPos++; return endPos - startPos; } /** * Checks if the specified line is the Outlook "original message" line. */ private static bool IsOutlookQuoteStart( string line ) { line = line.Trim(); if ( line.Length < 15) // 5 dashes at start, 5 dashes at end and something in between return false; int pos = 0; int startDashes = 0; while (pos < line.Length && line [pos] == '-' ) { pos++; startDashes++; } if ( startDashes != 5 ) return false; pos = line.Length - 1; int endDashes = 0; while (pos >= 0 && line [pos] == '-') { pos--; endDashes++; } return (endDashes == 5); } /** * Checks if non-quoted text paragraphs have already peen parsed. */ private bool HaveNonquotedTextParagraphs() { foreach( Paragraph para in _paragraphs ) { if ( (para.Type == ParagraphType.Plain || para.Type == ParagraphType.Fixed ) && para.QuoteLevel == 0 ) { return true; } } return false; } } }