本文的示例类主要用于提取 HTML 页面中的文本内容。
示例代码
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Net;
using System.Net.NetworkInformation;
using System.Net.Sockets;
using System.Threading;
using System.Text.RegularExpressions;

namespace HtmlRegex
{
    /// <summary>
    /// Helpers for downloading an HTML page and reducing it to plain text.
    /// </summary>
    public class BaseRegex
    {
        private const RegexOptions Options =
            RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled;

        // The patterns are built once and shared; Regex instances are immutable
        // and safe to reuse across calls.

        // <script ...> ... </script>, including blocks with an empty body.
        private static readonly Regex ScriptBlock = new Regex(@"\<script.*?\>.*?\</script\>", Options);

        // <style ...> ... </style>, including blocks with an empty body.
        private static readonly Regex StyleBlock = new Regex(@"\<style.*?\>.*?\</style\>", Options);

        // <!-- ... --> HTML comments.
        private static readonly Regex HtmlComment = new Regex(@"\<!\-\-.+?\-\-\>", Options);

        // Any remaining <...> tag.
        private static readonly Regex AnyTag = new Regex(@"\<.*?\>", Options);

        // Character entities such as &nbsp; or &#160;.
        private static readonly Regex Entity = new Regex(@"&\S{2,}?;", Options);

        // Runs of two or more CR/LF characters.
        private static readonly Regex NewlineRun = new Regex(@"[\r\n]{2,}", Options);

        /// <summary>
        /// Appends one line of text to a file (the file is created if it does not exist).
        /// </summary>
        /// <param name="path">Path of the file to append to.</param>
        /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
        /// <param name="content">Text to write, followed by a line terminator.</param>
        public void DeBug(string path, int encoding, string content)
        {
            // "using" guarantees the writer is flushed and closed even if WriteLine throws.
            using (StreamWriter writer = new StreamWriter(path, true, SelectEncoding(encoding)))
            {
                writer.WriteLine(content);
            }
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and decodes it to a string.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
        /// <returns>The decoded response body.</returns>
        public string getPageContent(string url, int encoding)
        {
            // A short-lived client per call: WebClient is disposable and does not
            // support concurrent requests on a single instance.
            using (WebClient client = new WebClient())
            {
                byte[] buff = client.DownloadData(url);
                return SelectEncoding(encoding).GetString(buff);
            }
        }

        /// <summary>
        /// Strips scripts, styles, comments, tags and character entities from
        /// <paramref name="html"/>, then removes newline runs and all space characters.
        /// </summary>
        /// <param name="html">HTML markup to clean.</param>
        /// <returns>The remaining text; an empty string when the input is null or empty.</returns>
        public string checkHtml(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            // Strip JS and CSS blocks first so their bodies never reach the output.
            html = ScriptBlock.Replace(html, "");
            html = StyleBlock.Replace(html, "");

            // Comments must go before the generic tag pattern: a comment that contains
            // '>' would otherwise be cut at that character and its tail would leak
            // into the result.
            html = HtmlComment.Replace(html, "");
            html = AnyTag.Replace(html, "");
            html = Entity.Replace(html, "");

            // NOTE(review): this also removes every single "\r\n" pair (it is two
            // characters long), not only blank lines - kept as in the original; confirm intent.
            html = NewlineRun.Replace(html, "");

            // NOTE(review): removes every space character, including those between
            // words - kept as in the original; confirm intent.
            html = html.Replace(" ", "");
            return html;
        }

        // Maps the numeric encoding flag used by the public methods to an Encoding.
        private static Encoding SelectEncoding(int encoding)
        {
            return encoding == 1 ? Encoding.UTF8 : Encoding.Default;
        }
    }
}
原文链接:https://www.cnblogs.com/shya/p/2439443.html
本文来源于爱码网,其版权均为原网址所有,与本站无关;文章内容系作者个人观点,不代表本站对其观点的赞同或支持。如需转载,请注明文章来源。
© 版权声明
文章版权归作者所有,未经允许请勿转载。