用 Java 爬小姐姐图片,这个厉害了…
点击下方“Java编程鸭”关注并标星
更多精彩 第一时间直达
来自:blog.csdn.net/qq_35402412/article/details/113627625
目的
准备工作
https://pic.sogou.com/pics?query=美女
分析
https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=美女
start=48:表示从第48张图片开始检索
xml_len=48:表示从第48张往后获取48张图片
query=?:搜索关键词(例:美女,这里浏览器自动做了转码,不影响我们使用)
思路
1. 设置URL请求参数;2. 访问URL请求,获取图片地址;3. 图片地址存入List;4. 遍历List,使用线程池下载到本地。
代码
com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.utils.HttpClientUtils;
import victor.chang.crawler.pipeline.SougouImgPipeline;
import java.util.ArrayList;
import java.util.List;
/**
* A simple PageProcessor.
*
@author [email protected] <br>
*
@since 0.1.0
*/
publicclassSougouImgProcessor{
private String url;
private SougouImgPipeline pipeline;
private List<JSONObject> dataList;
private List<String> urlList;
private String word;
publicSougouImgProcessor(String url,String word){
this.url = url;
this.word = word;
this.pipeline =
new SougouImgPipeline();
this.dataList =
new ArrayList<>();
this.urlList =
new ArrayList<>();
}
publicvoidprocess(int idx, int size){
String res = HttpClientUtils.get(String.format(
this.url, idx, size,
this.word));
JSONObject object = JSONObject.parseObject(res);
List<JSONObject> items = (List<JSONObject>)((JSONObject)object.get(
"data")).get(
"items");
for(JSONObject item : items){
this.urlList.add(item.getString(
"picUrl"));
}
this.dataList.addAll(items);
}
// 下载 publicvoidpipelineData(){
// 多线程 pipeline.processSync(
this.urlList,
this.word);
}
publicstaticvoidmain(String[] args){
String url =
"https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
SougouImgProcessor processor =
new SougouImgProcessor(url,
"美女");
int start =
0, size =
50, limit =
1000;
// 定义爬取开始索引、每次爬取数量、总共爬取数量 for(
int i=start;i<start+limit;i+=size)
processor.process(i, size);
processor.pipelineData();
}
}
java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Downloads pictures to the local file system, one sub-directory per category.
 *
 * <p>Files are written to {@code <path>/<category>/<name><extension>}. The success and
 * failure counters are {@link AtomicInteger}s shared by all calls on one instance, so the
 * totals printed by the {@code process*} methods are cumulative.
 *
 * @since 0.1.0
 */
public class SougouImgPipeline {

    /** Upper bound for a generated file name; longer names keep their (more distinctive) tail. */
    private static final int MAX_FILE_NAME_LENGTH = 200;

    /** Extension used when the URL's last path segment does not carry one. */
    private String extension = ".jpg";
    /** Root directory under which the category directories are created. */
    private String path;
    private final AtomicInteger suc = new AtomicInteger();
    private final AtomicInteger fails = new AtomicInteger();

    /** Creates a pipeline that stores files under {@code E:/pipeline/sougou}. */
    public SougouImgPipeline() {
        this("E:/pipeline/sougou");
    }

    /**
     * @param path root directory for downloaded files
     */
    public SougouImgPipeline(String path) {
        this.path = path;
    }

    /**
     * @param path      root directory for downloaded files
     * @param extension fallback extension (including the dot) for URLs without one
     */
    public SougouImgPipeline(String path, String extension) {
        this.path = path;
        this.extension = extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads one picture unless a file with the target name already exists.
     *
     * @param url  address of the picture
     * @param cate category; used as the sub-directory name
     * @param name base name of the target file (sanitized before use)
     * @throws Exception if the directory cannot be created, the URL is malformed or the
     *                   transfer fails; a partially written file is removed first
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        String dirPath = this.path + "/" + cate + "/";
        File dir = new File(dirPath);
        // mkdirs() returns false when another worker created the directory first,
        // hence the second isDirectory() check.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IllegalStateException("Cannot create directory " + dirPath);
        }

        URL source = new URL(url);
        String fileName = buildFileName(name, extensionOf(source));
        File img = new File(dir, fileName);
        if (img.exists()) {
            // Already downloaded earlier: skip.
            System.out.println(String.format("文件%s已存在本地目录", fileName));
            return;
        }

        URLConnection con = source.openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);
        try (InputStream inputStream = con.getInputStream();
             FileOutputStream os = new FileOutputStream(img)) {
            byte[] bs = new byte[1024];
            int len;
            while ((len = inputStream.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        } catch (Exception e) {
            // Both streams are closed at this point. Remove the partial file so that a
            // later run does not mistake it for a finished download.
            img.delete();
            throw e;
        }
        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下载第%s张图片", suc.incrementAndGet()));
    }

    /**
     * Returns the extension (with leading dot) of the URL's last path segment, or the
     * configured fallback extension when that segment has none.
     */
    private String extensionOf(URL source) {
        String urlPath = source.getPath();
        String lastSegment = urlPath.substring(urlPath.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        if (dot < 0 || dot == lastSegment.length() - 1) {
            return this.extension == null ? "" : this.extension;
        }
        return lastSegment.substring(dot);
    }

    /** Builds a file name from a base name and extension that is legal on common file systems. */
    private static String buildFileName(String name, String ext) {
        // When the caller passes the URL itself as the name, avoid "x.jpg.jpg".
        String base = name.endsWith(ext) ? name.substring(0, name.length() - ext.length()) : name;
        String cleaned = (base + ext).replace("-", "").replaceAll("[^A-Za-z0-9._]", "_");
        if (cleaned.length() > MAX_FILE_NAME_LENGTH) {
            cleaned = cleaned.substring(cleaned.length() - MAX_FILE_NAME_LENGTH);
        }
        return cleaned;
    }

    /** Left-pads a list index with zeros to at least four digits. */
    private static String indexName(int index) {
        StringBuilder padded = new StringBuilder(Integer.toString(index));
        while (padded.length() < 4) {
            padded.insert(0, '0');
        }
        return padded.toString();
    }

    /** Counts a failed download and reports it on {@code System.err}. */
    private void recordFailure(String url, Exception cause) {
        fails.incrementAndGet();
        System.err.println("Download failed: " + url + " (" + cause + ")");
    }

    /**
     * Downloads all pictures one after another on the calling thread.
     *
     * @param data picture URLs; {@code null} entries are skipped
     * @param word category (sub-directory) for the files
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            try {
                // The URL doubles as the base name; downloadImg sanitizes it.
                downloadImg(picUrl, word, picUrl);
            } catch (Exception e) {
                recordFailure(picUrl, e);
            }
        }
        System.out.println("下载成功: " + suc.get());
        System.out.println("下载失败: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads all pictures concurrently, one pool task per URL, and waits up to
     * 60 seconds for them to finish before printing a summary.
     *
     * <p>Files are named after the zero-padded position of the URL in {@code data}.
     *
     * @param data picture URLs; {@code null} entries are skipped
     * @param word category (sub-directory) for the files
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            final String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            final String finalName = indexName(i);
            executorService.execute(() -> {
                try {
                    downloadImg(picUrl, word, finalName);
                } catch (Exception e) {
                    recordFailure(picUrl, e);
                }
            });
        }
        executorService.shutdown();
        try {
            if (executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                System.out.println("AwaitTermination Finished");
            } else {
                // Running downloads are deliberately not interrupted; the numbers
                // below are then only a snapshot.
                System.out.println("AwaitTermination Timeout");
            }
            System.out.println("共有URL: " + data.size());
            System.out.println("下载成功: " + suc);
            System.out.println("下载失败: " + fails);

            File dir = new File(this.path + "/" + word + "/");
            String[] entries = dir.list();
            // list() returns null when the directory was never created.
            int len = entries == null ? 0 : entries.length;
            System.out.println("当前共有文件: " + len);

            long end = System.currentTimeMillis();
            System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller instead of swallowing it.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Splits the URLs into {@code threadNum} contiguous segments and runs
     * {@link #process(List, String)} on each segment in its own pool task. The method
     * returns after submitting the tasks; it does not wait for them.
     *
     * <p>When there are fewer URLs than threads, or {@code threadNum} is not positive,
     * everything is processed on the calling thread instead.
     *
     * @param data      picture URLs
     * @param word      category (sub-directory) for the files
     * @param threadNum number of segments / tasks
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum <= 0 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum; // URLs per segment
        for (int i = 0; i < threadNum; i++) {
            int start = i * num;
            // The last segment also takes the remainder.
            int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> process(cutList, word));
        }
        executorService.shutdown();
    }
}
org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
*
@author * Date: 17/3/27
*/
publicabstractclassHttpClientUtils{
publicstatic Map<String, List<String>> convertHeaders(Header[] headers) {
Map<String, List<String>> results =
new HashMap<String, List<String>>();
for (Header header : headers) {
List<String> list = results.get(header.getName());
if (list ==
null) {
list =
new ArrayList<String>();
results.put(header.getName(), list);
}
list.add(header.getValue());
}
return results;
}
/**
* http的get请求
*
@param url
*/
publicstatic String get(String url){
return get(url,
"UTF-8");
}
publicstatic Logger logger = LoggerFactory.getLogger(HttpClientUtils
.class);
/**
* http的get请求
*
@param url
*/
publicstatic String get(String url, String charset){
HttpGet httpGet =
new HttpGet(url);
return executeRequest(httpGet, charset);
}
/**
* http的get请求,增加异步请求头参数
*
@param url
*/
publicstatic String ajaxGet(String url){
return ajaxGet(url,
"UTF-8");
}
/**
* http的get请求,增加异步请求头参数
*
*
@param url
*/
publicstatic String ajaxGet(String url, String charset){
HttpGet httpGet =
new HttpGet(url);
httpGet.setHeader(
"X-Requested-With",
"XMLHttpRequest");
return executeRequest(httpGet, charset);
}
/**
*
@param url
*
@return */
publicstatic String ajaxGet(CloseableHttpClient httpclient, String url){
HttpGet httpGet =
new HttpGet(url);
httpGet.setHeader(
"X-Requested-With",
"XMLHttpRequest");
return executeRequest(httpclient, httpGet,
"UTF-8");
}
/**
* http的post请求,传递map格式参数
*/
publicstatic String post(String url, Map<String, String> dataMap){
return post(url, dataMap,
"UTF-8");
}
/**
* http的post请求,传递map格式参数
*/
publicstatic String post(String url, Map<String, String> dataMap, String charset){
HttpPost httpPost =
new HttpPost(url);
try {
if (dataMap !=
null) {
List<NameValuePair> nvps =
new ArrayList<NameValuePair>();
for (Map.Entry<String, String> entry : dataMap.entrySet()) {
nvps.add(
new BasicNameValuePair(entry.getKey(), entry.getValue()));
}
UrlEncodedFormEntity formEntity =
new UrlEncodedFormEntity(nvps, charset);
formEntity.setContentEncoding(charset);
httpPost.setEntity(formEntity);
}
}
catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return executeRequest(httpPost, charset);
}
/**
* http的post请求,增加异步请求头参数,传递map格式参数
*/
publicstatic String ajaxPost(String url, Map<String, String> dataMap){
return ajaxPost(url, dataMap,
"UTF-8");
}
/**
* http的post请求,增加异步请求头参数,传递map格式参数
*/
publicstatic String ajaxPost(String url, Map<String, String> dataMap, String charset){
HttpPost httpPost =
new HttpPost(url);
httpPost.setHeader(
"X-Requested-With",
"XMLHttpRequest");
try {
if (dataMap !=
null) {
List<NameValuePair> nvps =
new ArrayList<NameValuePair>();
for (Map.Entry<String, String> entry : dataMap.entrySet()) {
nvps.add(
new BasicNameValuePair(entry.getKey(), entry.getValue()));
}
UrlEncodedFormEntity formEntity =
new UrlEncodedFormEntity(nvps, charset);
formEntity.setContentEncoding(charset);
httpPost.setEntity(formEntity);
}
}
catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return executeRequest(httpPost, charset);
}
/**
* http的post请求,增加异步请求头参数,传递json格式参数
*/
publicstatic String ajaxPostJson(String url, String jsonString){
return ajaxPostJson(url, jsonString,
"UTF-8");
}
/**
* http的post请求,增加异步请求头参数,传递json格式参数
*/
publicstatic String ajaxPostJson(String url, String jsonString, String charset){
HttpPost httpPost =
new HttpPost(url);
httpPost.setHeader(
"X-Requested-With",
"XMLHttpRequest");
StringEntity stringEntity =
new StringEntity(jsonString, charset);
// 解决中文乱码问题 stringEntity.setContentEncoding(charset);
stringEntity.setContentType(
"application/json");
httpPost.setEntity(stringEntity);
return executeRequest(httpPost, charset);
}
/**
* 执行一个http请求,传递HttpGet或HttpPost参数
*/
publicstatic String executeRequest(HttpUriRequest httpRequest){
return executeRequest(httpRequest,
"UTF-8");
}
/**
* 执行一个http请求,传递HttpGet或HttpPost参数
*/
publicstatic String executeRequest(HttpUriRequest httpRequest, String charset){
CloseableHttpClient httpclient;
if (
"https".equals(httpRequest.getURI().getScheme())) {
httpclient = createSSLInsecureClient();
}
else {
httpclient = HttpClients.createDefault();
}
String result =
"";
try {
try {
CloseableHttpResponse response = httpclient.execute(httpRequest);
HttpEntity entity =
null;
try {
entity = response.getEntity();
result = EntityUtils.toString(entity, charset);
}
finally {
EntityUtils.consume(entity);
response.close();
}
}
finally {
httpclient.close();
}
}
catch (IOException ex) {
ex.printStackTrace();
}
return result;
}
publicstatic String executeRequest(CloseableHttpClient httpclient, HttpUriRequest httpRequest, String charset){
String result =
"";
try {
try {
CloseableHttpResponse response = httpclient.execute(httpRequest);
HttpEntity entity =
null;
try {
entity = response.getEntity();
result = EntityUtils.toString(entity, charset);
}
finally {
EntityUtils.consume(entity);
response.close();
}
}
finally {
httpclient.close();
}
}
catch (IOException ex) {
ex.printStackTrace();
}
return result;
}
/**
* 创建 SSL连接
*/
publicstatic CloseableHttpClient createSSLInsecureClient(){
try {
SSLContext sslContext =
new SSLContextBuilder().loadTrustMaterial(
new TrustStrategy() {
@OverridepublicbooleanisTrusted(X509Certificate[] chain, String authType)throws CertificateException {
returntrue;
}
}).build();
SSLConnectionSocketFactory sslsf =
new SSLConnectionSocketFactory(sslContext,
new HostnameVerifier() {
@Overridepublicbooleanverify(String hostname, SSLSession session){
returntrue;
}
});
return HttpClients.custom().setSSLSocketFactory(sslsf).build();
}
catch (GeneralSecurityException ex) {
thrownew RuntimeException(ex);
}
}
}
运行
END
看完本文有收获?请转发分享给更多人
关注「Java编程鸭」,提升Java技能
关注Java编程鸭微信公众号,后台回复:码农大礼包可以获取最新整理的技术资料一份。涵盖Java 框架学习、架构师学习等!
文章有帮助的话,在看,转发吧。
谢谢支持哟 (*^__^*)
最新评论
推荐文章
作者最新文章
你可能感兴趣的文章
Copyright Disclaimer: The copyright of contents (including texts, images, videos and audios) posted above belong to the User who shared or the third-party website which the User shared from. If you found your copyright have been infringed, please send a DMCA takedown notice to [email protected]. For more detail of the source, please click on the button "Read Original Post" below. For other communications, please send to [email protected].
版权声明:以上内容为用户推荐收藏至CareerEngine平台,其内容(含文字、图片、视频、音频等)及知识版权均属用户或用户转发自的第三方网站,如涉嫌侵权,请通知[email protected]进行信息删除。如需查看信息来源,请点击“查看原文”。如需洽谈其它事宜,请联系[email protected]。
版权声明:以上内容为用户推荐收藏至CareerEngine平台,其内容(含文字、图片、视频、音频等)及知识版权均属用户或用户转发自的第三方网站,如涉嫌侵权,请通知[email protected]进行信息删除。如需查看信息来源,请点击“查看原文”。如需洽谈其它事宜,请联系[email protected]。