标签:java String links 批量 html new 壁纸 import
必应的壁纸一张一张手动下载比较麻烦,于是写了一个小爬虫来批量下载,代码如下:
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import java.io.*; import java.net.*; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 必应壁纸下载 * javac -encoding UTF-8 -cp .;C:\Users\administered\Downloads\jsoup-1.12.1.jar WallpaperDownload.java * java -cp .;C:\Users\administered\Downloads\jsoup-1.12.1.jar WallpaperDownload E:\\wallpaper */ public class WallpaperDownload { private static final String BY_PREFIX = "https://bing.ioliu.cn"; public static void main(String[] args) { // 运行时指定一个本地下载路径 String path = ""; for(int i=0; i<args.length; i++){ path = args[i]; } if(path == null || path.length()==0){ path = "E:\\wallpaper"; } File filePath = new File(path); if(!filePath.exists()){ System.out.println("创建目录:" + filePath.getName()); filePath.mkdirs(); } System.out.println("下载位置:" + filePath.getName()); download(path); } public static void download(String path) { long start = System.currentTimeMillis(); String pageHtml = "https://bing.ioliu.cn/ranking"; for (int i = 1; i <= 105; i++) { if (i > 1) { pageHtml = pageHtml + "?p=" + i; } try { String[] links = getAddress(pageHtml); execute(links, path); } catch (IOException e) { e.printStackTrace(); } pageHtml = "https://bing.ioliu.cn/ranking"; } long end = System.currentTimeMillis(); long time = (end - start) / 1000; System.out.println("下载耗时:" + time); } /** * 下载图片 * * @param links * @param path 下载位置 * @throws IOException */ public static void execute(String[] links, String path) throws IOException { if (!path.endsWith("\\")) { path = path + "\\"; } for (int i = 0; i < links.length; i++) { HttpURLConnection urlConnection = getConnection(links[i]); InputStream ins = urlConnection.getInputStream(); String imageName = links[i].substring(links[i].lastIndexOf("/") + 1).split("\\?")[0]; File file = new File(path + imageName + ".jpg"); OutputStream outputStream = null; if (!file.exists()) { outputStream = new FileOutputStream(file); int readCount; byte[] 
bytes = new byte[10240]; while ((readCount = ins.read(bytes)) != -1) { outputStream.write(bytes, 0, readCount); } System.out.println("[" + imageName + "] download finished ..."); } else { System.out.println(file.getName() + " existed ..."); } } } /** * 获取下载链接地址 * * @return * @throws IOException */ public static String[] getAddress(String htmlPage) throws IOException { System.out.println("get [" + htmlPage + "] info ..."); HttpURLConnection connection = getConnection(htmlPage); InputStream is = connection.getInputStream(); String newLine = System.getProperty("line.separator"); BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); StringBuilder result = new StringBuilder(); String line; String html; while ((line = reader.readLine()) != null) { result.append(line + newLine); } html = result.toString(); Document doc = Jsoup.parseBodyFragment(html); html = doc.body().html(); String[] links = extractLinks(html); return links; } /** * 提取图片链接 * * @param html */ static String[] extractLinks(String html) { List<String> list = new ArrayList<>(); String pattern = "/photo/.*_.*\\?force=download"; Pattern r = Pattern.compile(pattern); Matcher m = r.matcher(html); while (m.find()) { list.add(m.group()); } String[] results = new String[list.size()]; for (int i = 0; i < list.size(); i++) { results[i] = BY_PREFIX + list.get(i); } return results; } /** * 获取连接 * * @param urlStr * @return */ public static HttpURLConnection getConnection(String urlStr) { HttpURLConnection urlConnection = null; try { URI uri = new URI(urlStr); URL url = uri.toURL(); urlConnection = (HttpURLConnection) url.openConnection(); } catch (URISyntaxException e) { e.printStackTrace(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return urlConnection; } }
附:本程序依赖 HTML 解析库 Jsoup(示例中使用 jsoup-1.12.1.jar),编译和运行时都需要把该 jar 加入 classpath。
标签:java,String,links,批量,html,new,壁纸,import 来源: https://www.cnblogs.com/rookiek/p/11359865.html
本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享; 2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关; 3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关; 4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除; 5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。