博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
利用HtmlAgilityPack抓取网站图片并下载
阅读量:7186 次
发布时间:2019-06-29

本文共 4361 字,大约阅读时间需要 14 分钟。

 

今天在博客园看到一个不错的抓取帖(主要是那个 url……你懂的),花了几分钟改了一下:代码增加了按年月日建立目录、按文章建立子目录的功能,图片都保存在对应的目录内;程序改为以命令行方式运行,并增加了抓取全站的参数。

原始版本:

 

老版本代码(运行前请记得先在 E 盘下新建一个 DownLoadImg 文件夹):

主要代码如下:
// NOTE(review): presumably used by DownLoadImg, which is not part of this excerpt.
WebClient wc = new WebClient();
private static int i = 0;

protected void Page_Load(object sender, EventArgs e)
{
}

/// <summary>
/// Loads every page in the configured date / page-id ranges and passes the
/// <c>src</c> of each image found inside the page's "ks_xp" element to DownLoadImg.
/// </summary>
protected void Button1_Click(object sender, EventArgs e)
{
    HtmlWeb web = new HtmlWeb();

    // Sites of this kind generally use URLs ending in .../section/yyyymmdd/<digits>.html.
    // They are usually split into several sections, so the page ids inside a single
    // section are not contiguous.
    // Two nested loops are used: the outer one walks the yyyymmdd date (e.g. 2012-02-15),
    // the inner one walks each .html page. Adjust both ranges as needed.
    for (int k = 20120215; k <= 20120215; k++)
    {
        for (int j = 124289; j <= 124306; j++)
        {
            // Fill in the URL pattern of the target site here; it is in the downloadable source.
            string cnblogs = "";
            HtmlDocument doc = web.Load(cnblogs);

            HtmlNode node = doc.GetElementbyId("ks_xp");
            if (node == null)
            {
                continue;
            }

            // ".//img" keeps the query inside the article element ("//img" is rooted at
            // the document), and SelectNodes yields null when nothing matches.
            HtmlNodeCollection images = node.SelectNodes(".//img");
            if (images == null)
            {
                continue;
            }

            foreach (HtmlNode child in images)
            {
                if (child.Attributes["src"] == null)
                {
                    continue;
                }

                string imgurl = child.Attributes["src"].Value;
                DownLoadImg(imgurl);
            }
        }
    }
}

 

新版本代码:

 

#region Using namespace
using System;
using System.IO;
using System.Linq;
using System.Net;
using HtmlAgilityPack;
#endregion
namespace DownloadImages
{
    /// <summary>
    /// Console tool that walks a range of dated article pages, looks for the element with
    /// id "ks_xp" on each page and saves the images inside it under
    /// <c>Images\{yyyyMMdd}\{page title}\</c> below the current directory.
    /// </summary>
    internal class Program
    {
        /// <summary>Client shared by every image download.</summary>
        private static readonly WebClient Wc = new WebClient();

        /// <summary>
        /// Site prefix placed in front of <c>{yyyyMMdd}/{pageId}.html</c>.
        /// The published listing omits it (see the downloadable source); fill it in before running.
        /// </summary>
        private const string BaseUrl = "";

        /// <summary>First and last page id tried for every day.</summary>
        private const int StartPageId = 49430;
        private const int EndPageId = 124621;

        /// <summary>First day to crawl; the crawl ends with the day the program is started.</summary>
        private static readonly DateTime FirstDay = new DateTime(2010, 8, 18);

        /// <summary>
        /// Characters stripped from file and folder names: every control character
        /// (U+0000..U+001F) plus <c>" &lt; &gt; | : * ? \ /</c>.
        /// </summary>
        private static readonly char[] InvalidFileNameChars =
            Enumerable.Range(0, 32)
                      .Select(code => (char)code)
                      .Concat(new[] { '"', '<', '>', '|', ':', '*', '?', '\\', '/' })
                      .ToArray();

        /// <summary>
        /// Removes every character listed in <see cref="InvalidFileNameChars"/> from
        /// <paramref name="fileName"/>. When the result is longer than one character and
        /// starts with '.', the leading dots are replaced by the literal prefix "dot".
        /// </summary>
        /// <param name="fileName">Raw name; <c>null</c> is treated as an empty string.</param>
        /// <returns>The cleaned name (possibly empty).</returns>
        public static string CleanInvalidFileName(string fileName)
        {
            fileName = fileName ?? "";

            // Single pass instead of one string.Replace per invalid character.
            fileName = new string(
                fileName.Where(c => Array.IndexOf(InvalidFileNameChars, c) < 0).ToArray());

            if (fileName.Length > 1 && fileName[0] == '.')
            {
                fileName = "dot" + fileName.TrimStart('.');
            }

            return fileName;
        }

        private static void Main(string[] args)
        {
            Start();
        }

        /// <summary>
        /// Visits every page id for every calendar day from <see cref="FirstDay"/> up to
        /// today, then removes the folders that ended up empty.
        /// </summary>
        private static void Start()
        {
            var web = new HtmlWeb();
            DateTime lastDay = DateTime.Today;

            // Step through real calendar days. Counting a yyyyMMdd integer upwards would
            // also request days that do not exist (20100832, 20100899, ...).
            for (DateTime day = FirstDay; day <= lastDay; day = day.AddDays(1))
            {
                string dayFolder = day.ToString(
                    "yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);

                for (int pageId = StartPageId; pageId <= EndPageId; pageId++)
                {
                    DownloadPageImages(web, dayFolder, pageId);
                }
            }

            // Clean up afterwards.
            CleanEmptyFolders();
        }

        /// <summary>
        /// Loads one page and downloads every image found inside its "ks_xp" element.
        /// Pages without that element, or without images, are skipped.
        /// </summary>
        private static void DownloadPageImages(HtmlWeb web, string dayFolder, int pageId)
        {
            string pageUrl = BaseUrl + dayFolder + "/" + pageId + ".html";

            HtmlDocument doc;
            try
            {
                doc = web.Load(pageUrl);
            }
            catch (WebException ex)
            {
                // Same best-effort policy as DownLoadImg: report and move on, so one
                // unreachable page does not abort the whole crawl.
                Console.WriteLine(ex.Message);
                return;
            }

            HtmlNode content = doc.GetElementbyId("ks_xp");
            if (content == null)
            {
                return;
            }

            // Fall back to the page id when the page has no <title>.
            HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//title");
            string titleName = titleNode != null ? titleNode.InnerText : pageId.ToString();
            string articleFolder = CleanInvalidFileName(titleName);

            // ".//img" keeps the query inside the article element ("//img" is rooted at
            // the document), and SelectNodes yields null when nothing matches.
            HtmlNodeCollection images = content.SelectNodes(".//img");
            if (images == null)
            {
                return;
            }

            foreach (HtmlNode image in images)
            {
                HtmlAttribute src = image.Attributes["src"];
                if (src == null || string.IsNullOrEmpty(src.Value))
                {
                    continue;
                }

                string imgurl = src.Value;
                Console.WriteLine("正在下载:" + titleName + " " + imgurl);
                DownLoadImg(imgurl, dayFolder, articleFolder);
            }
        }

        /// <summary>
        /// Deletes every folder below <c>Images</c> that contains no files, directly or in
        /// any of its subfolders. Does nothing when the <c>Images</c> folder does not exist.
        /// </summary>
        private static void CleanEmptyFolders()
        {
            string root = Path.Combine(Environment.CurrentDirectory, "Images");
            if (!Directory.Exists(root))
            {
                return;
            }

            // A child path is always longer than its parent's, so sorting by length puts
            // children first. Each folder is then either truly empty (safe for the
            // non-recursive Delete) or still holds something and is kept.
            var deepestFirst = Directory
                .GetDirectories(root, "*", SearchOption.AllDirectories)
                .OrderByDescending(path => path.Length);

            foreach (string folder in deepestFirst)
            {
                if (Directory.GetFileSystemEntries(folder).Length == 0)
                {
                    Directory.Delete(folder);
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> into
        /// <c>Images\{folderName}\{subFolderName}\</c> below the current directory, using the
        /// cleaned last segment of the URL as the file name. Failures are written to the
        /// console and otherwise ignored.
        /// </summary>
        /// <param name="url">Address of the image.</param>
        /// <param name="folderName">First-level folder (the yyyyMMdd day).</param>
        /// <param name="subFolderName">Second-level folder (the cleaned page title).</param>
        private static void DownLoadImg(string url, string folderName, string subFolderName)
        {
            string fileName = CleanInvalidFileName(url.Substring(url.LastIndexOf('/') + 1));

            try
            {
                string fileFolder = Path.Combine(
                    Environment.CurrentDirectory, "Images", folderName, subFolderName);

                // CreateDirectory is a no-op for an existing folder; it sits inside the try
                // so an unusable path is reported like a failed download.
                Directory.CreateDirectory(fileFolder);
                Wc.DownloadFile(url, Path.Combine(fileFolder, fileName));
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
    }
}
 

测试程序和源码下载:

转载于:https://www.cnblogs.com/wifi/articles/2473690.html

你可能感兴趣的文章
打开页面自动强制QQ对话
查看>>
php 代码设置跳转
查看>>
图的邻接表存储
查看>>
XSS绕过WAF的姿势
查看>>
[python] 类常用的内置方法
查看>>
01-kubernetes集群的基本概念
查看>>
vs2010 msvcr100.DLL 丢失!!! 用release 就可以了
查看>>
Spring RMI Example
查看>>
BZOJ1042: [HAOI2008]硬币购物
查看>>
sql面试题
查看>>
Linuxの安装SQL Developer工具
查看>>
Vue中axios的使用
查看>>
2048 控制台版(C#)
查看>>
[论文笔记] A Survey of Software Refactoring(TOSE, 2004)第一部分
查看>>
关于EOF和循环体的搭配使用。
查看>>
电梯调度--初
查看>>
C#与.NET
查看>>
2.7 Go channel
查看>>
jQuery代码实现表格内容可编辑修改
查看>>
app测试--DDMS的使用
查看>>