正则表达式过滤HTML、JS、CSS

分类：HTML文章/教程 · 发布时间：1年前 (2023) · 作者：勤奋小助

0 0

主要用于在提取 HTML 页面的正文内容时，过滤掉其中的 HTML 标签、JS 脚本和 CSS 样式。

示例代码

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Net;
using System.Net.NetworkInformation;
using System.Net.Sockets;
using System.Threading;
using System.Text.RegularExpressions;
namespace HtmlRegex
{
    /// <summary>
    /// Helpers for downloading a page, reducing its HTML to bare text by stripping
    /// comments, scripts, styles, tags and character entities, and appending
    /// diagnostic text to a file.
    /// </summary>
    public class BaseRegex
    {
        // Comments, <script> elements and <style> elements are removed in a single
        // left-to-right pass, so whichever construct opens first wins. A commented-out
        // <script> tag or a "<!--" inside a script body is therefore handled as one unit.
        // The \b keeps look-alike tag names such as <scripture> from being treated as scripts.
        private static readonly Regex NonContentRegex = new Regex(
            @"<!--.*?-->|<script\b[^>]*>.*?</script\s*>|<style\b[^>]*>.*?</style\s*>",
            RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Any remaining tag, including tags that span several lines.
        private static readonly Regex TagRegex = new Regex(@"\<.*?\>", RegexOptions.Singleline);

        // Only well-formed character references (&name; &#123; &#x1F;). A looser pattern
        // would delete ordinary text that merely sits between an '&' and a later ';'.
        private static readonly Regex EntityRegex = new Regex(
            @"&(?:#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]+);",
            RegexOptions.IgnoreCase);

        // Runs of two or more CR/LF characters (a single "\r\n" pair counts as such a run).
        private static readonly Regex LineBreakRunRegex = new Regex(@"[\r\n]{2,}");

        /// <summary>
        /// Maps the integer encoding selector used by this class to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
        /// <returns>The selected encoding.</returns>
        private static Encoding ResolveEncoding(int encoding)
        {
            if (encoding == 1)
            {
                return Encoding.UTF8;
            }
            return Encoding.Default;
        }

        /// <summary>
        /// Appends <paramref name="content"/> followed by a line terminator to the file at
        /// <paramref name="path"/>.
        /// </summary>
        /// <param name="path">File to append to.</param>
        /// <param name="encoding">1 writes UTF-8; any other value writes with <see cref="Encoding.Default"/>.</param>
        /// <param name="content">Text to write.</param>
        public void DeBug(string path, int encoding, string content)
        {
            // The using block flushes and closes the writer even when WriteLine throws,
            // so the file handle is never left open.
            using (StreamWriter writer = new StreamWriter(path, true, ResolveEncoding(encoding)))
            {
                writer.WriteLine(content);
            }
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and decodes the bytes as text.
        /// </summary>
        /// <param name="url">Address of the resource to download.</param>
        /// <param name="encoding">1 decodes as UTF-8; any other value decodes with <see cref="Encoding.Default"/>.</param>
        /// <returns>The decoded response body.</returns>
        public string getPageContent(string url, int encoding)
        {
            // A WebClient per call is disposed deterministically and avoids sharing one
            // instance between overlapping calls.
            using (WebClient client = new WebClient())
            {
                byte[] payload = client.DownloadData(url);
                return ResolveEncoding(encoding).GetString(payload);
            }
        }

        /// <summary>
        /// Strips markup from <paramref name="html"/> and returns the remaining text.
        /// </summary>
        /// <remarks>
        /// Removal order: comments, script and style elements (one pass), remaining tags,
        /// character entities, runs of two or more CR/LF characters, and finally every
        /// space character. Entities are removed, not decoded. Line-break runs are deleted
        /// rather than collapsed, so the text on either side is joined.
        /// </remarks>
        /// <param name="html">HTML source; may be null or empty.</param>
        /// <returns>The filtered text, or an empty string when the input is null or empty.</returns>
        public string checkHtml(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            string text = NonContentRegex.Replace(html, "");
            text = TagRegex.Replace(text, "");
            text = EntityRegex.Replace(text, "");
            text = LineBreakRunRegex.Replace(text, "");
            return text.Replace(" ", "");
        }
    }
}