Home  >  Article  >  Backend Development  >  C# implements method of converting HTML into plain text

C# implements method of converting HTML into plain text

高洛峰
高洛峰Original
2017-01-14 13:19:411994browse

The example in this article describes the method of converting HTML into plain text in C#. Share it with everyone for your reference. details as follows:

Instructions:

HtmlToText convert = new HtmlToText();
textBox2.Text = convert.Convert(textBox1.Text);

The C# code is as follows:

/// <summary>
/// Converts HTML to plain text.
/// </summary>
class HtmlToText
{
  // Static data tables
  protected static Dictionary<string, string> _tags;
  protected static HashSet<string> _ignoreTags;
  // Instance variables
  protected TextBuilder _text;
  protected string _html;
  protected int _pos;
  // Static constructor (one time only)
  static HtmlToText()
  {
    _tags = new Dictionary<string, string>();
    _tags.Add("address", "\n");
    _tags.Add("blockquote", "\n");
    _tags.Add("div", "\n");
    _tags.Add("dl", "\n");
    _tags.Add("fieldset", "\n");
    _tags.Add("form", "\n");
    _tags.Add("h1", "\n");
    _tags.Add("/h1", "\n");
    _tags.Add("h2", "\n");
    _tags.Add("/h2", "\n");
    _tags.Add("h3", "\n");
    _tags.Add("/h3", "\n");
    _tags.Add("h4", "\n");
    _tags.Add("/h4", "\n");
    _tags.Add("h5", "\n");
    _tags.Add("/h5", "\n");
    _tags.Add("h6", "\n");
    _tags.Add("/h6", "\n");
    _tags.Add("p", "\n");
    _tags.Add("/p", "\n");
    _tags.Add("table", "\n");
    _tags.Add("/table", "\n");
    _tags.Add("ul", "\n");
    _tags.Add("/ul", "\n");
    _tags.Add("ol", "\n");
    _tags.Add("/ol", "\n");
    _tags.Add("/li", "\n");
    _tags.Add("br", "\n");
    _tags.Add("/td", "\t");
    _tags.Add("/tr", "\n");
    _tags.Add("/pre", "\n");
    _ignoreTags = new HashSet<string>();
    _ignoreTags.Add("script");
    _ignoreTags.Add("noscript");
    _ignoreTags.Add("style");
    _ignoreTags.Add("object");
  }
  /// <summary>
  /// Converts the given HTML to plain text and returns the result.
  /// </summary>
  /// <param name="html">HTML to be converted</param>
  /// <returns>Resulting plain text</returns>
  public string Convert(string html)
  {
    // Initialize state variables
    _text = new TextBuilder();
    _html = html;
    _pos = 0;
    // Process input
    while (!EndOfText)
    {
      if (Peek() == '<')
      {
        // HTML tag
        bool selfClosing;
        string tag = ParseTag(out selfClosing);
        // Handle special tag cases
        if (tag == "body")
        {
          // Discard content before <body>
          _text.Clear();
        }
        else if (tag == "/body")
        {
          // Discard content after </body>
          _pos = _html.Length;
        }
        else if (tag == "pre")
        {
          // Enter preformatted mode
          _text.Preformatted = true;
          EatWhitespaceToNextLine();
        }
        else if (tag == "/pre")
        {
          // Exit preformatted mode
          _text.Preformatted = false;
        }
        string value;
        if (_tags.TryGetValue(tag, out value))
          _text.Write(value);
        if (_ignoreTags.Contains(tag))
          EatInnerContent(tag);
      }
      else if (Char.IsWhiteSpace(Peek()))
      {
        // Whitespace (treat all as space)
        _text.Write(_text.Preformatted ? Peek() : ' ');
        MoveAhead();
      }
      else
      {
        // Other text
        _text.Write(Peek());
        MoveAhead();
      }
    }
    // Return result
    return HttpUtility.HtmlDecode(_text.ToString());
  }
  // Eats all characters that are part of the current tag
  // and returns information about that tag
  protected string ParseTag(out bool selfClosing)
  {
    string tag = String.Empty;
    selfClosing = false;
    if (Peek() == '<')
    {
      MoveAhead();
      // Parse tag name
      EatWhitespace();
      int start = _pos;
      if (Peek() == '/')
        MoveAhead();
      while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
        Peek() != '/' && Peek() != '>')
        MoveAhead();
      tag = _html.Substring(start, _pos - start).ToLower();
      // Parse rest of tag
      while (!EndOfText && Peek() != '>')
      {
        if (Peek() == '"' || Peek() == '\'')
          EatQuotedValue();
        else
        {
          if (Peek() == '/')
            selfClosing = true;
          MoveAhead();
        }
      }
      MoveAhead();
    }
    return tag;
  }
  // Consumes inner content from the current tag
  protected void EatInnerContent(string tag)
  {
    string endTag = "/" + tag;
    while (!EndOfText)
    {
      if (Peek() == '<')
      {
        // Consume a tag
        bool selfClosing;
        if (ParseTag(out selfClosing) == endTag)
          return;
        // Use recursion to consume nested tags
        if (!selfClosing && !tag.StartsWith("/"))
          EatInnerContent(tag);
      }
      else MoveAhead();
    }
  }
  // Returns true if the current position is at the end of
  // the string
  protected bool EndOfText
  {
    get { return (_pos >= _html.Length); }
  }
  // Safely returns the character at the current position
  protected char Peek()
  {
    return (_pos < _html.Length) ? _html[_pos] : (char)0;
  }
  // Safely advances to current position to the next character
  protected void MoveAhead()
  {
    _pos = Math.Min(_pos + 1, _html.Length);
  }
  // Moves the current position to the next non-whitespace
  // character.
  protected void EatWhitespace()
  {
    while (Char.IsWhiteSpace(Peek()))
      MoveAhead();
  }
  // Moves the current position to the next non-whitespace
  // character or the start of the next line, whichever
  // comes first
  protected void EatWhitespaceToNextLine()
  {
    while (Char.IsWhiteSpace(Peek()))
    {
      char c = Peek();
      MoveAhead();
      if (c == '\n')
        break;
    }
  }
  // Moves the current position past a quoted value
  protected void EatQuotedValue()
  {
    char c = Peek();
    if (c == '"' || c == '\'')
    {
      // Opening quote
      MoveAhead();
      // Find end of value
      int start = _pos;
      _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
      if (_pos < 0)
        _pos = _html.Length;
      else
        MoveAhead();  // Closing quote
    }
  }
  /// <summary>
  /// A StringBuilder class that helps eliminate excess whitespace.
  /// </summary>
  protected class TextBuilder
  {
    private StringBuilder _text;
    private StringBuilder _currLine;
    private int _emptyLines;
    private bool _preformatted;
    // Construction
    public TextBuilder()
    {
      _text = new StringBuilder();
      _currLine = new StringBuilder();
      _emptyLines = 0;
      _preformatted = false;
    }
    /// <summary>
    /// Normally, extra whitespace characters are discarded.
    /// If this property is set to true, they are passed
    /// through unchanged.
    /// </summary>
    public bool Preformatted
    {
      get
      {
        return _preformatted;
      }
      set
      {
        if (value)
        {
          // Clear line buffer if changing to
          // preformatted mode
          if (_currLine.Length > 0)
            FlushCurrLine();
          _emptyLines = 0;
        }
        _preformatted = value;
      }
    }
    /// <summary>
    /// Clears all current text.
    /// </summary>
    public void Clear()
    {
      _text.Length = 0;
      _currLine.Length = 0;
      _emptyLines = 0;
    }
    /// <summary>
    /// Writes the given string to the output buffer.
    /// </summary>
    /// <param name="s"></param>
    public void Write(string s)
    {
      foreach (char c in s)
        Write(c);
    }
    /// <summary>
    /// Writes the given character to the output buffer.
    /// </summary>
    /// <param name="c">Character to write</param>
    public void Write(char c)
    {
      if (_preformatted)
      {
        // Write preformatted character
        _text.Append(c);
      }
      else
      {
        if (c == '\r')
        {
          // Ignore carriage returns. We'll process
          // '\n' if it comes next
        }
        else if (c == '\n')
        {
          // Flush current line
          FlushCurrLine();
        }
        else if (Char.IsWhiteSpace(c))
        {
          // Write single space character
          int len = _currLine.Length;
          if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
            _currLine.Append(' ');
        }
        else
        {
          // Add character to current line
          _currLine.Append(c);
        }
      }
    }
    // Appends the current line to output buffer
    protected void FlushCurrLine()
    {
      // Get current line
      string line = _currLine.ToString().Trim();
      // Determine if line contains non-space characters
      string tmp = line.Replace(" ", String.Empty);
      if (tmp.Length == 0)
      {
        // An empty line
        _emptyLines++;
        if (_emptyLines < 2 && _text.Length > 0)
          _text.AppendLine(line);
      }
      else
      {
        // A non-empty line
        _emptyLines = 0;
        _text.AppendLine(line);
      }
      // Reset current line
      _currLine.Length = 0;
    }
    /// <summary>
    /// Returns the current output as a string.
    /// </summary>
    public override string ToString()
    {
      if (_currLine.Length > 0)
        FlushCurrLine();
      return _text.ToString();
    }
  }
}

I hope this article will be helpful to everyone’s C# programming.

For more C# methods to convert HTML into plain text, please pay attention to the PHP Chinese website!


Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn