search
Home > Backend Development > C#.Net Tutorial > C# Convert HTML to plain text

/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter is a small hand-written scanner: it walks the input one
/// character at a time, turns selected block-level tags into line breaks
/// (see <c>_tags</c>), drops the content of non-visible elements (see
/// <c>_ignoreTags</c>), collapses runs of whitespace outside of
/// <c>&lt;pre&gt;</c> and finally HTML-decodes the result.
/// An instance keeps the parsing state in fields, so a single instance must
/// not be used for two conversions at the same time.
/// </remarks>
class HtmlToText
{
    // Static data tables
    // Maps a lower-case tag name (closing tags include the leading '/')
    // to the text written to the output when that tag is encountered.
    protected static Dictionary<string, string> _tags;
    // Lower-case names of elements whose inner content is discarded.
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;    // Output accumulator
    protected string _html;         // Input being converted
    protected int _pos;             // Current read position within _html

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>();
        _tags.Add("address", "\n");
        _tags.Add("blockquote", "\n");
        _tags.Add("div", "\n");
        _tags.Add("dl", "\n");
        _tags.Add("fieldset", "\n");
        _tags.Add("form", "\n");
        _tags.Add("h1", "\n");
        _tags.Add("/h1", "\n");
        _tags.Add("h2", "\n");
        _tags.Add("/h2", "\n");
        _tags.Add("h3", "\n");
        _tags.Add("/h3", "\n");
        _tags.Add("h4", "\n");
        _tags.Add("/h4", "\n");
        _tags.Add("h5", "\n");
        _tags.Add("/h5", "\n");
        _tags.Add("h6", "\n");
        _tags.Add("/h6", "\n");
        _tags.Add("p", "\n");
        _tags.Add("/p", "\n");
        _tags.Add("table", "\n");
        _tags.Add("/table", "\n");
        _tags.Add("ul", "\n");
        _tags.Add("/ul", "\n");
        _tags.Add("ol", "\n");
        _tags.Add("/ol", "\n");
        _tags.Add("/li", "\n");
        _tags.Add("br", "\n");
        _tags.Add("/td", "\t");
        _tags.Add("/tr", "\n");
        _tags.Add("/pre", "\n");

        _ignoreTags = new HashSet<string>();
        _ignoreTags.Add("script");
        _ignoreTags.Add("noscript");
        _ignoreTags.Add("style");
        _ignoreTags.Add("object");
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated
    /// as an empty document.</param>
    /// <returns>Resulting plain text</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                    _text.Write(value);

                // A self-closed element (<object ... />) has no inner
                // content; looking for its end tag would swallow the
                // remainder of the document.
                if (!selfClosing && _ignoreTags.Contains(tag))
                    EatInnerContent(tag);
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }
        // Return result. Entities are decoded last so that a decoded '<'
        // is never mistaken for the start of a tag.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    // Eats all characters that are part of the current tag
    // and returns information about that tag.
    // Returns the lower-case tag name (closing tags keep their leading
    // '/'), "!--" for a comment, or an empty string if the current
    // character is not '<'. selfClosing is set to true only when the tag
    // ends with "/>".
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // A comment runs to the next "-->", not to the next '>';
            // otherwise commented-out markup would leak into the output.
            // Searching from _pos + 1 also accepts the short forms
            // "<!-->" and "<!--->".
            if (String.CompareOrdinal(_html, _pos, "!--", 0, 3) == 0)
            {
                int end = _html.IndexOf("-->", _pos + 1, StringComparison.Ordinal);
                _pos = (end < 0) ? _html.Length : end + 3;
                return "!--";
            }

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
                MoveAhead();
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
                MoveAhead();
            // Invariant lower-casing keeps the lookups culture independent
            // (e.g. "DIV" under a Turkish culture).
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                char c = Peek();
                if (c == '"' || c == '\'')
                {
                    EatQuotedValue();
                    selfClosing = false;
                }
                else
                {
                    // Only a '/' that is the last non-whitespace character
                    // before '>' marks a self-closing tag; a '/' inside an
                    // unquoted attribute value does not.
                    if (c == '/')
                        selfClosing = true;
                    else if (!Char.IsWhiteSpace(c))
                        selfClosing = false;
                    MoveAhead();
                }
            }
            MoveAhead();
        }
        return tag;
    }

    // Consumes inner content from the current tag, up to and including
    // the matching end tag (or the end of the input if there is none).
    // 'tag' is the lower-case name of the element that was just opened.
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        // <script> and <style> hold raw text, not markup: their content
        // may contain '<' and quotes, so scan directly for the end tag.
        if (tag == "script" || tag == "style")
        {
            SkipRawText("<" + endTag);
            return;
        }

        // Other elements: track nesting of the same element only, so
        // unrelated or unclosed inner tags cannot unbalance the count.
        int depth = 1;
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string name = ParseTag(out selfClosing);
                if (name == endTag)
                {
                    depth--;
                    if (depth == 0)
                        return;
                }
                else if (name == tag && !selfClosing)
                {
                    depth++;
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    // Moves past raw text up to and including the first end tag that
    // starts with closeMarker (e.g. "</script"), compared without regard
    // to case. Moves to the end of the input if no end tag is found.
    private void SkipRawText(string closeMarker)
    {
        int search = _pos;
        while (search < _html.Length)
        {
            int idx = _html.IndexOf(closeMarker, search, StringComparison.OrdinalIgnoreCase);
            if (idx < 0)
                break;

            // Reject longer names such as "</scripts"
            int after = idx + closeMarker.Length;
            if (after >= _html.Length || _html[after] == '>' ||
                _html[after] == '/' || Char.IsWhiteSpace(_html[after]))
            {
                _pos = idx;
                bool ignored;
                ParseTag(out ignored);
                return;
            }
            search = after;
        }
        _pos = _html.Length;
    }

    // Returns true if the current position is at the end of
    // the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ('\0' when at the end of the string)
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace
    // character.
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
            MoveAhead();
    }

    // Moves the current position to the next non-whitespace
    // character or the start of the next line, whichever
    // comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
                break;
        }
    }

    // Moves the current position past a quoted value. The value is also
    // considered finished at a line break, which limits the damage done
    // by an unterminated quote.
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();
            // Find end of value
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
                _pos = _html.Length;
            else
                MoveAhead();    // Closing quote
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;        // Completed output
        private StringBuilder _currLine;    // Line currently being built
        private int _emptyLines;            // Consecutive empty lines seen
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                        FlushCurrLine();
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write</param>
        public void Write(string s)
        {
            foreach (char c in s)
                Write(c);
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    FlushCurrLine();
                }
                else if (Char.IsWhiteSpace(c))
                {
                    // Write single space character
                    int len = _currLine.Length;
                    if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                        _currLine.Append(' ');
                }
                else
                {
                    // Add character to current line
                    _currLine.Append(c);
                }
            }
        }

        // Appends the current line to output buffer. At most one empty
        // line in a row is kept, and none at the very start of the output.
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // After Trim() the line is empty exactly when it held
            // nothing but whitespace
            if (line.Length == 0)
            {
                // An empty line
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                    _text.AppendLine(line);
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string. Any pending line is
        /// flushed to the output first.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
                FlushCurrLine();
            return _text.ToString();
        }
    }
}

Statement
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
C# .NET Development: A Beginner's Guide to Getting Started (Apr 18, 2025, 12:17 AM)

To start C#.NET development, you need to: 1. Understand the basic knowledge of C# and the core concepts of the .NET framework; 2. Master the basic concepts of variables, data types, control structures, functions and classes; 3. Learn advanced features of C#, such as LINQ and asynchronous programming; 4. Be familiar with debugging techniques and performance optimization methods for common errors. With these steps, you can gradually penetrate the world of C#.NET and write efficient applications.

C# and .NET: Understanding the Relationship Between the TwoC# and .NET: Understanding the Relationship Between the TwoApr 17, 2025 am 12:07 AM

The relationship between C# and .NET is inseparable, but they are not the same thing. C# is a programming language, while .NET is a development platform. C# is used to write code, compile into .NET's intermediate language (IL), and executed by the .NET runtime (CLR).

The Continued Relevance of C# .NET: A Look at Current UsageThe Continued Relevance of C# .NET: A Look at Current UsageApr 16, 2025 am 12:07 AM

C#.NET is still important because it provides powerful tools and libraries that support multiple application development. 1) C# combines .NET framework to make development efficient and convenient. 2) C#'s type safety and garbage collection mechanism enhance its advantages. 3) .NET provides a cross-platform running environment and rich APIs, improving development flexibility.

From Web to Desktop: The Versatility of C# .NETFrom Web to Desktop: The Versatility of C# .NETApr 15, 2025 am 12:07 AM

C#.NET is versatile for both web and desktop development. 1) For web, use ASP.NET for dynamic applications. 2) For desktop, employ Windows Forms or WPF for rich interfaces. 3) Use Xamarin for cross-platform development, enabling code sharing across Windows, macOS, Linux, and mobile devices.

C# .NET and the Future: Adapting to New TechnologiesC# .NET and the Future: Adapting to New TechnologiesApr 14, 2025 am 12:06 AM

C# and .NET adapt to the needs of emerging technologies through continuous updates and optimizations. 1) C# 9.0 and .NET5 introduce record type and performance optimization. 2) .NETCore enhances cloud native and containerized support. 3) ASP.NETCore integrates with modern web technologies. 4) ML.NET supports machine learning and artificial intelligence. 5) Asynchronous programming and best practices improve performance.

Is C# .NET Right for You? Evaluating its ApplicabilityIs C# .NET Right for You? Evaluating its ApplicabilityApr 13, 2025 am 12:03 AM

C#.NET is suitable for enterprise-level applications within the Microsoft ecosystem due to its strong typing, rich libraries, and robust performance. However, it may not be ideal for cross-platform development or when raw speed is critical, where languages like Rust or Go might be preferable.

C# Code within .NET: Exploring the Programming ProcessC# Code within .NET: Exploring the Programming ProcessApr 12, 2025 am 12:02 AM

The programming process of C# in .NET includes the following steps: 1) writing C# code, 2) compiling into an intermediate language (IL), and 3) executing by the .NET runtime (CLR). The advantages of C# in .NET are its modern syntax, powerful type system and tight integration with the .NET framework, suitable for various development scenarios from desktop applications to web services.

C# .NET: Exploring Core Concepts and Programming FundamentalsC# .NET: Exploring Core Concepts and Programming FundamentalsApr 10, 2025 am 09:32 AM

C# is a modern, object-oriented programming language developed by Microsoft as part of the .NET framework. 1. C# supports object-oriented programming (OOP), including encapsulation, inheritance and polymorphism. 2. Asynchronous programming in C# is implemented through the async and await keywords to improve application responsiveness. 3. Use LINQ to process data collections concisely. 4. Common errors include null reference exceptions and index out-of-range exceptions. Debugging skills include using a debugger and exception handling. 5. Performance optimization includes using StringBuilder and avoiding unnecessary boxing and unboxing.

See all articles

Hot AI Tools

Undresser.AI Undress

Undresser.AI Undress

AI-powered app for creating realistic nude photos

AI Clothes Remover

AI Clothes Remover

Online AI tool for removing clothes from photos.

Undress AI Tool

Undress AI Tool

Undress images for free

Clothoff.io

Clothoff.io

AI clothes remover

AI Hentai Generator

AI Hentai Generator

Generate AI Hentai for free.

Hot Article

R.E.P.O. Energy Crystals Explained and What They Do (Yellow Crystal)
1 months agoBy尊渡假赌尊渡假赌尊渡假赌
R.E.P.O. Best Graphic Settings
1 months agoBy尊渡假赌尊渡假赌尊渡假赌
Will R.E.P.O. Have Crossplay?
1 months agoBy尊渡假赌尊渡假赌尊渡假赌

Hot Tools

MinGW - Minimalist GNU for Windows

MinGW - Minimalist GNU for Windows

This project is in the process of being migrated to osdn.net/projects/mingw, you can continue to follow us there. MinGW: A native Windows port of the GNU Compiler Collection (GCC), freely distributable import libraries and header files for building native Windows applications; includes extensions to the MSVC runtime to support C99 functionality. All MinGW software can run on 64-bit Windows platforms.

Notepad++7.3.1

Notepad++7.3.1

Easy-to-use and free code editor

WebStorm Mac version

WebStorm Mac version

Useful JavaScript development tools

Dreamweaver Mac version

Dreamweaver Mac version

Visual web development tools

SublimeText3 Mac version

SublimeText3 Mac version

God-level code editing software (SublimeText3)