今天在博客园看到一个不错的图片抓取帖(主要是那个 URL……你懂的),花了几分钟改了一下:代码增加了按年月日建立目录、按文章建立子目录的功能,图片都保存在对应的子目录内;程序改为以命令行方式运行,并增加了全站抓取的参数。
原始版本:
老版本代码(记住:运行前需要先在 E 盘下新建一个 DownLoadImg 文件夹):
主要代码如下:
WebClient wc = new WebClient();
private static int i = 0;

protected void Page_Load(object sender, EventArgs e)
{
}

/// <summary>
/// Walks a fixed range of article pages and passes the <c>src</c> of every
/// image found inside the page's <c>ks_xp</c> element to <c>DownLoadImg</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    HtmlWeb web = new HtmlWeb();

    // The target sites generally use URLs ending in .../section/yyyymmdd/<digits>.html.
    // A site is usually split into several sections, so the .html ids inside one
    // section are not consecutive.
    // Two nested loops are used: the outer one walks the yyyymmdd date (e.g. 2012-02-15),
    // the inner one walks the .html ids. Adjust both ranges as needed.
    for (int k = 20120215; k <= 20120215; k++)
    {
        for (int j = 124289; j <= 124306; j++)
        {
            // Placeholder: fill in the target site's URL format here
            // (the original post ships it in the source download).
            string cnblogs = "";
            HtmlDocument doc = web.Load(cnblogs);

            HtmlNode node = doc.GetElementbyId("ks_xp");
            if (node == null)
            {
                continue;
            }

            // ".//img" is relative to the container; a bare "//img" is an absolute
            // XPath and would select every image in the whole document.
            HtmlNodeCollection images = node.SelectNodes(".//img");
            if (images == null)
            {
                // SelectNodes yields null (not an empty collection) when nothing matches.
                continue;
            }

            foreach (HtmlNode child in images)
            {
                HtmlAttribute src = child.Attributes["src"];
                if (src == null)
                {
                    continue;
                }

                DownLoadImg(src.Value);
            }
        }
    }
}
新版本代码:
#region Using namespace

using System;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using HtmlAgilityPack;

#endregion

namespace DownloadImages
{
    /// <summary>
    /// Console crawler: for every day in a date range and every page id in an id
    /// range it loads the article page, finds the images inside the page's
    /// <c>ks_xp</c> element and saves them under
    /// <c>Images\&lt;yyyyMMdd&gt;\&lt;article title&gt;\</c> in the current directory.
    /// </summary>
    internal class Program
    {
        private static readonly WebClient Wc = new WebClient();

        /// <summary>
        /// Prefix that precedes <c>yyyyMMdd/pageId.html</c> in an article URL.
        /// The original post deliberately omits it; fill it in before running.
        /// </summary>
        private static readonly string SiteUrlPrefix = "";

        /// <summary>
        /// Characters removed from file and folder names before they are used on disk.
        /// </summary>
        private static readonly char[] InvalidFileNameChars = new[]
        {
            '"', '<', '>', '|', '\0',
            '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006',
            '\a', '\b', '\t', '\n', '\v', '\f', '\r',
            '\u000e', '\u000f', '\u0010', '\u0011', '\u0012', '\u0013',
            '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019',
            '\u001a', '\u001b', '\u001c', '\u001d', '\u001e', '\u001f',
            ':', '*', '?', '\\', '/'
        };

        /// <summary>
        /// Strips every character listed in <see cref="InvalidFileNameChars"/> from
        /// <paramref name="fileName"/>. If the result is longer than one character
        /// and starts with a dot, the leading dots are replaced by the text "dot".
        /// </summary>
        /// <param name="fileName">Raw name; <c>null</c> is treated as an empty string.</param>
        /// <returns>The cleaned name (possibly empty).</returns>
        public static string CleanInvalidFileName(string fileName)
        {
            fileName = fileName + "";

            // Single pass over the input instead of one Replace call per forbidden character.
            fileName = new string(fileName.Where(ch => !InvalidFileNameChars.Contains(ch)).ToArray());

            if (fileName.Length > 1 && fileName[0] == '.')
            {
                fileName = "dot" + fileName.TrimStart('.');
            }

            return fileName;
        }

        private static void Main(string[] args)
        {
            Start();
        }

        /// <summary>
        /// Crawls every (day, page id) combination in the configured ranges and
        /// then removes the folders that ended up without any file.
        /// </summary>
        private static void Start()
        {
            if (string.IsNullOrEmpty(SiteUrlPrefix))
            {
                // Without the prefix every page URL would be relative and could not be loaded.
                Console.WriteLine("SiteUrlPrefix is empty; set it to the target site's URL prefix first.");
                return;
            }

            var web = new HtmlWeb();
            var startDate = new DateTime(2010, 8, 18);
            var endDate = DateTime.Now.Date;
            const int startPageId = 49430;
            const int endPageId = 124621;

            // Step through real calendar days. Counting an int in yyyyMMdd form would
            // also visit values that are not dates (20100832 ... 20100900, and so on).
            for (var day = startDate; day <= endDate; day = day.AddDays(1))
            {
                var dateKey = day.ToString("yyyyMMdd", CultureInfo.InvariantCulture);

                for (var pageId = startPageId; pageId <= endPageId; pageId++)
                {
                    var pageUrl = SiteUrlPrefix + dateKey + "/" + pageId + ".html";
                    DownloadPageImages(web, pageUrl, dateKey, pageId);
                }
            }

            // Clean-up
            CleanEmptyFolders();
        }

        /// <summary>
        /// Loads one article page and downloads every image inside its <c>ks_xp</c> element.
        /// </summary>
        /// <param name="web">Loader used to fetch the page.</param>
        /// <param name="pageUrl">URL of the article page.</param>
        /// <param name="dateKey">Date in yyyyMMdd form; used as the first-level folder name.</param>
        /// <param name="pageId">Page id; used as the folder name when the page has no usable title.</param>
        private static void DownloadPageImages(HtmlWeb web, string pageUrl, string dateKey, int pageId)
        {
            HtmlDocument doc;
            try
            {
                doc = web.Load(pageUrl);
            }
            catch (WebException ex)
            {
                // One page that fails to load must not abort the whole crawl.
                Console.WriteLine(pageUrl + " " + ex.Message);
                return;
            }

            HtmlNode container = doc.GetElementbyId("ks_xp");
            if (container == null)
            {
                return;
            }

            // ".//img" is relative to the container; a bare "//img" is an absolute
            // XPath and would select every image in the whole document.
            // SelectNodes yields null (not an empty collection) when nothing matches.
            var images = container.SelectNodes(".//img");
            if (images == null)
            {
                return;
            }

            var pageIdText = pageId.ToString(CultureInfo.InvariantCulture);
            var titleNode = doc.DocumentNode.SelectSingleNode("//title");
            var titleName = titleNode != null ? titleNode.InnerText : pageIdText;

            // A title made only of stripped characters or blanks would give an empty
            // folder name; fall back to the page id in that case.
            var subFolderName = CleanInvalidFileName(titleName).Trim();
            if (subFolderName.Length == 0)
            {
                subFolderName = pageIdText;
            }

            foreach (HtmlNode image in images)
            {
                var srcAttribute = image.Attributes["src"];
                if (srcAttribute == null || string.IsNullOrEmpty(srcAttribute.Value))
                {
                    continue;
                }

                var imgurl = srcAttribute.Value;
                Console.WriteLine("正在下载:" + titleName + " " + imgurl);
                DownLoadImg(imgurl, dateKey, subFolderName);
            }
        }

        /// <summary>
        /// Deletes every folder below <c>Images</c> that contains no file at any depth.
        /// </summary>
        private static void CleanEmptyFolders()
        {
            var rootFolder = Path.Combine(Environment.CurrentDirectory, "Images");
            if (!Directory.Exists(rootFolder))
            {
                // Nothing was downloaded, so there is nothing to clean.
                return;
            }

            foreach (var folder in Directory.GetDirectories(rootFolder, "*", SearchOption.AllDirectories))
            {
                // The folder may already be gone because a parent was removed recursively.
                if (!Directory.Exists(folder))
                {
                    continue;
                }

                if (Directory.GetFiles(folder, "*", SearchOption.AllDirectories).Length == 0)
                {
                    // Recursive delete: a folder without files may still hold empty
                    // sub-folders, which makes the non-recursive overload throw.
                    Directory.Delete(folder, true);
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> into
        /// <c>Images\&lt;folderName&gt;\&lt;subFolderName&gt;\</c> below the current directory.
        /// The file name is the cleaned text after the last '/' of the URL.
        /// Failures are written to the console and otherwise ignored (best effort).
        /// </summary>
        /// <param name="url">Image URL.</param>
        /// <param name="folderName">First-level folder name.</param>
        /// <param name="subFolderName">Second-level folder name.</param>
        private static void DownLoadImg(string url, string folderName, string subFolderName)
        {
            try
            {
                var fileName = CleanInvalidFileName(url.Substring(url.LastIndexOf('/') + 1));
                var fileFolder = Path.Combine(
                    Path.Combine(Path.Combine(Environment.CurrentDirectory, "Images"), folderName),
                    subFolderName);

                // CreateDirectory does nothing when the folder already exists.
                Directory.CreateDirectory(fileFolder);
                Wc.DownloadFile(url, Path.Combine(fileFolder, fileName));
            }
            catch (Exception ex)
            {
                // Deliberate best effort: report the failure and carry on with the next image.
                Console.WriteLine(ex.Message);
            }
        }
    }
}
测试程序和源码下载: