152 lines
4.1 KiB
C#
152 lines
4.1 KiB
C#
|
using CsharpHttpHelper.Enum;
|
|||
|
using CsharpHttpHelper.Item;
|
|||
|
using CsharpHttpHelper.Static;
|
|||
|
using System.Collections.Generic;
|
|||
|
using System.Text.RegularExpressions;
|
|||
|
|
|||
|
namespace CsharpHttpHelper.Helper
|
|||
|
{
|
|||
|
/// <summary>
/// HTML helper operations: link and image extraction, tag stripping and
/// text extraction. Copyright: http://www.httphelper.com/
/// </summary>
internal class HtmlHelper
{
    /// <summary>
    /// Collects every match of <c>RegexString.Alist</c> in the given HTML
    /// and converts each one into an <see cref="AItem"/>.
    /// </summary>
    /// <param name="html">The HTML to analyze.</param>
    /// <returns>
    /// A list with one <see cref="AItem"/> per match that could be converted,
    /// or <c>null</c> (not an empty list) when the pattern does not match at all.
    /// </returns>
    internal static List<AItem> GetAList(string html)
    {
        // One Matches call is enough: the previous IsMatch + Matches pair
        // scanned the input twice with the same pattern and options.
        MatchCollection matches = Regex.Matches(html, RegexString.Alist, RegexOptions.IgnoreCase);
        if (matches.Count == 0)
        {
            // Existing contract: "nothing found" is reported as null.
            return null;
        }

        List<AItem> links = new List<AItem>(matches.Count);
        foreach (Match match in matches)
        {
            AItem link;
            try
            {
                // Group 1 is used as the link target and group 2 as the link
                // content; the groups themselves are defined by RegexString.Alist.
                link = new AItem
                {
                    Href = match.Groups[1].Value,
                    Text = match.Groups[2].Value,
                    Html = match.Value,
                    Type = AType.Text
                };

                // If the link content itself contains image matches, the link
                // is reclassified as an image link and keeps the first image.
                List<ImgItem> images = GetImgList(link.Text);
                if (images != null && images.Count > 0)
                {
                    link.Type = AType.Img;
                    link.Img = images[0];
                }
            }
            catch
            {
                // Deliberate best effort: a match that cannot be converted is
                // skipped instead of failing the whole extraction.
                link = null;
            }

            if (link != null)
            {
                links.Add(link);
            }
        }

        return links;
    }

    /// <summary>
    /// Collects every match of <c>RegexString.ImgList</c> in the given HTML
    /// and converts each one into an <see cref="ImgItem"/>.
    /// </summary>
    /// <param name="html">The HTML to analyze.</param>
    /// <returns>
    /// A list with one <see cref="ImgItem"/> per match that could be converted,
    /// or <c>null</c> (not an empty list) when the pattern does not match at all.
    /// </returns>
    internal static List<ImgItem> GetImgList(string html)
    {
        // Single scan instead of IsMatch followed by Matches.
        MatchCollection matches = Regex.Matches(html, RegexString.ImgList, RegexOptions.IgnoreCase);
        if (matches.Count == 0)
        {
            // Existing contract: "nothing found" is reported as null.
            return null;
        }

        List<ImgItem> images = new List<ImgItem>(matches.Count);
        foreach (Match match in matches)
        {
            ImgItem image;
            try
            {
                // Group 1 is used as the image source; the group is defined
                // by RegexString.ImgList.
                image = new ImgItem
                {
                    Src = match.Groups[1].Value,
                    Html = match.Value
                };
            }
            catch
            {
                // Deliberate best effort: skip a match that cannot be converted.
                image = null;
            }

            if (image != null)
            {
                images.Add(image);
            }
        }

        return images;
    }

    /// <summary>
    /// Filters HTML markup out of the given text by removing everything that
    /// matches the <c>RegexString.Nscript</c>, <c>RegexString.Style</c>,
    /// <c>RegexString.Script</c> and <c>RegexString.Html</c> patterns.
    /// </summary>
    /// <param name="html">The HTML content.</param>
    /// <returns>The text that remains after all four removals.</returns>
    internal static string StripHTML(string html)
    {
        const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

        // The patterns are applied in this fixed order; each pass works on
        // the output of the previous one.
        html = Regex.Replace(html, RegexString.Nscript, string.Empty, options);
        html = Regex.Replace(html, RegexString.Style, string.Empty, options);
        html = Regex.Replace(html, RegexString.Script, string.Empty, options);
        html = Regex.Replace(html, RegexString.Html, string.Empty, options);
        return html;
    }

    /// <summary>
    /// Removes everything that matches <c>RegexString.NewLine</c> (the
    /// line-break pattern) from the given text.
    /// </summary>
    /// <param name="html">The HTML content.</param>
    /// <returns>The text with all pattern matches removed.</returns>
    internal static string ReplaceNewLine(string html) =>
        Regex.Replace(html, RegexString.NewLine, string.Empty, RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Extracts the data found between a start string and an end string in
    /// the given HTML.
    /// </summary>
    /// <param name="html">The source HTML.</param>
    /// <param name="s">The start string.</param>
    /// <param name="e">The end string.</param>
    /// <returns>
    /// The trimmed value of capture group 1 of the first match, or an empty
    /// string when there is no match.
    /// </returns>
    internal static string GetBetweenHtml(string html, string s, string e)
    {
        // NOTE(review): s and e are spliced into the pattern unescaped, so any
        // regex metacharacters they contain are interpreted as regex syntax,
        // and a capturing group inside s would shift which group is number 1.
        // Callers that want literal delimiters should pass Regex.Escape'd values.
        string pattern = $"{s}{RegexString.AllHtml}{e}";

        // One Match call replaces the former IsMatch + Match pair. Match never
        // returns null, so Success is the only check that is needed.
        Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups[1].Value.Trim() : string.Empty;
    }

    /// <summary>
    /// Extracts the page title from the given HTML using
    /// <c>RegexString.HtmlTitle</c>.
    /// </summary>
    /// <param name="html">The HTML to search.</param>
    /// <returns>
    /// The trimmed value of capture group 1 of the first match, or an empty
    /// string when there is no match.
    /// </returns>
    internal static string GetHtmlTitle(string html)
    {
        // No RegexOptions are passed here, so case sensitivity is decided by
        // the pattern itself.
        Match match = Regex.Match(html, RegexString.HtmlTitle);
        return match.Success ? match.Groups[1].Value.Trim() : string.Empty;
    }
}
|
|||
|
}
|