using CsharpHttpHelper.Enum;
using CsharpHttpHelper.Item;
using CsharpHttpHelper.Static;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace CsharpHttpHelper.Helper
{
/// <summary>
/// HTML helper operations. Copyright: http://www.httphelper.com/
/// </summary>
internal class HtmlHelper
{
/// <summary>
/// Collects every anchor (A) tag that <see cref="RegexString.Alist"/> matches in the given HTML.
/// </summary>
/// <param name="html">HTML markup to analyse. A null value is treated as "nothing found".</param>
/// <returns>
/// One <see cref="AItem"/> per match, in match order, or <c>null</c> when
/// <paramref name="html"/> is null or the pattern does not match at all.
/// </returns>
internal static List<AItem> GetAList(string html)
{
    // The Regex APIs throw ArgumentNullException on null input; report "no links" instead.
    if (html == null)
    {
        return null;
    }

    // Scan once: reading Count evaluates the matches, so a separate IsMatch pass is not needed.
    MatchCollection matches = Regex.Matches(html, RegexString.Alist, RegexOptions.IgnoreCase);
    if (matches.Count == 0)
    {
        // The existing contract is null (not an empty list) when nothing matches.
        return null;
    }

    List<AItem> links = new List<AItem>(matches.Count);
    foreach (Match match in matches)
    {
        AItem link;
        try
        {
            // Capture group 1 supplies the href, group 2 the inner content of the tag.
            link = new AItem
            {
                Href = match.Groups[1].Value,
                Text = match.Groups[2].Value,
                Html = match.Value,
                Type = AType.Text
            };

            // An anchor whose inner content contains an image is reported as an image link,
            // carrying the first image found.
            List<ImgItem> images = GetImgList(link.Text);
            if (images != null && images.Count > 0)
            {
                link.Type = AType.Img;
                link.Img = images[0];
            }
        }
        catch
        {
            // Best effort: an anchor that cannot be converted is skipped
            // rather than aborting the whole scan.
            continue;
        }

        links.Add(link);
    }

    return links;
}
/// <summary>
/// Collects every image (IMG) tag that <see cref="RegexString.ImgList"/> matches in the given HTML.
/// </summary>
/// <param name="html">HTML markup to analyse. A null value is treated as "nothing found".</param>
/// <returns>
/// One <see cref="ImgItem"/> per match, in match order, or <c>null</c> when
/// <paramref name="html"/> is null or the pattern does not match at all.
/// </returns>
internal static List<ImgItem> GetImgList(string html)
{
    // The Regex APIs throw ArgumentNullException on null input; report "no images" instead.
    if (html == null)
    {
        return null;
    }

    // Scan once: reading Count evaluates the matches, so a separate IsMatch pass is not needed.
    MatchCollection matches = Regex.Matches(html, RegexString.ImgList, RegexOptions.IgnoreCase);
    if (matches.Count == 0)
    {
        // The existing contract is null (not an empty list) when nothing matches.
        return null;
    }

    List<ImgItem> images = new List<ImgItem>(matches.Count);
    foreach (Match match in matches)
    {
        ImgItem image;
        try
        {
            // Capture group 1 supplies the src value; the whole match is kept as the raw tag.
            image = new ImgItem
            {
                Src = match.Groups[1].Value,
                Html = match.Value
            };
        }
        catch
        {
            // Best effort: a tag that cannot be converted is skipped
            // rather than aborting the whole scan.
            continue;
        }

        images.Add(image);
    }

    return images;
}
/// <summary>
/// Filters HTML tags out of the given content.
/// </summary>
/// <remarks>
/// Applies four patterns from <see cref="RegexString"/> in a fixed order
/// (Nscript, Style, Script, Html), deleting every match case-insensitively.
/// NOTE(review): the patterns are defined outside this file; presumably the first three
/// remove whole noscript/style/script blocks before the last removes remaining tags - confirm.
/// </remarks>
/// <param name="html">The HTML content.</param>
/// <returns>The text left after all four replacements.</returns>
internal static string StripHTML(string html)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

    // Order matters: each pass works on the output of the previous one.
    string[] patterns =
    {
        RegexString.Nscript,
        RegexString.Style,
        RegexString.Script,
        RegexString.Html
    };

    string result = html;
    foreach (string pattern in patterns)
    {
        result = Regex.Replace(result, pattern, string.Empty, options);
    }

    return result;
}
/// <summary>
/// Filters all line-break sequences out of the given HTML.
/// </summary>
/// <remarks>
/// Deletes every match of <see cref="RegexString.NewLine"/> (pattern defined outside this file).
/// </remarks>
/// <param name="html">The HTML content.</param>
/// <returns>The text with every match removed.</returns>
internal static string ReplaceNewLine(string html)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;
    string pattern = RegexString.NewLine;
    string cleaned = Regex.Replace(html, pattern, string.Empty, options);
    return cleaned;
}
/// <summary>
/// Extracts the data located between two delimiters in an HTML string.
/// </summary>
/// <param name="html">Source HTML. A null value yields an empty result.</param>
/// <param name="s">Start delimiter.</param>
/// <param name="e">End delimiter.</param>
/// <returns>
/// The trimmed value of capture group 1 of the first case-insensitive match of
/// <paramref name="s"/> + <see cref="RegexString.AllHtml"/> + <paramref name="e"/>,
/// or <see cref="string.Empty"/> when there is no match.
/// </returns>
internal static string GetBetweenHtml(string html, string s, string e)
{
    // Regex.Match throws ArgumentNullException on null input; report "not found" instead.
    if (html == null)
    {
        return string.Empty;
    }

    // The delimiters are spliced into the pattern verbatim, so any regex metacharacters
    // they contain are interpreted as pattern syntax, not as literal text.
    // NOTE(review): callers passing literal delimiters may want Regex.Escape - confirm usage.
    string pattern = $"{s}{RegexString.AllHtml}{e}";

    // One Match call is enough: Success tells us whether anything was found, and
    // Groups[1] yields an empty value if the group is absent.
    Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
    return match.Success ? match.Groups[1].Value.Trim() : string.Empty;
}
/// <summary>
/// Extracts the page title from an HTML document.
/// </summary>
/// <param name="html">The HTML to search. A null value yields an empty result.</param>
/// <returns>
/// The trimmed value of capture group 1 of the first match of
/// <see cref="RegexString.HtmlTitle"/>, or <see cref="string.Empty"/> when there is no match.
/// </returns>
internal static string GetHtmlTitle(string html)
{
    // Regex.Match throws ArgumentNullException on null input; report "no title" instead.
    if (html == null)
    {
        return string.Empty;
    }

    // Match once instead of IsMatch followed by Match. IgnoreCase brings this method in line
    // with the other helpers in this class; HTML tag names are not case-sensitive, so an
    // upper-case title tag should be found as well.
    Match match = Regex.Match(html, RegexString.HtmlTitle, RegexOptions.IgnoreCase);
    return match.Success ? match.Groups[1].Value.Trim() : string.Empty;
}
}
}