2017年10月11日 星期三

以Java 由Url下載圖片

今天要來記錄如何使用Java來下載網路上的圖片(或檔案,此例只下載圖片),

Java可以對Url進行Http Request來取得Response,
得到input stream後將檔案儲存下來,

為了避免有些Server會擋程式的Request,
我們必須模仿瀏覽器,在Request中加上User-Agent的Header (或更多的其他Header,模仿的越像越不容易被擋),

如果圖片的Url是http的話比較簡單,
但如果是https的話,即SSL,那就要取得對方Server網站的憑證,
或是自己實作一個 X509TrustManager,來讓所有憑證檢查都通過 (注意:這樣等於略過憑證驗證,只適合用於測試或可信任的來源),

如果是JDK 1.8 以下,可能會有此Exception :
Could not generate DH keypair
使用 JDK 1.8 就可解決此問題,
可參考 [Java] 處理無法透過SSL抓取網站資料的問題

以下為程式碼範例,說明都寫在註解中:


import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;

import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

/**
 * Downloads an image from an HTTP or HTTPS URL and stores it on disk.
 *
 * <p>The request carries a browser-like {@code User-Agent} header because some
 * servers reject requests that look like they come from a program.
 *
 * <p>SECURITY NOTE: HTTPS connections created by
 * {@link #getHttpURLConnectionFromHttps(String)} accept every server
 * certificate. That removes protection against man-in-the-middle attacks and
 * should only be used for testing or for sources that are otherwise trusted.
 */
public class ImageDownloader {

  /** Browser-like User-Agent sent with every request. */
  private static final String USER_AGENT =
      "Mozilla/5.0 (Linux; Android 4.2.1; Nexus 7 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19";

  /** Maximum time to wait for the TCP/TLS connection, in milliseconds. */
  private static final int CONNECT_TIMEOUT_MS = 15000;

  /** Maximum time to wait for data once connected, in milliseconds. */
  private static final int READ_TIMEOUT_MS = 30000;

  /** Size of the copy buffer used while streaming the body to disk. */
  private static final int BUFFER_SIZE = 8192;

  public static void main(String[] args) {

    String fileName_http = downloadImageFromUrl("http://www.image.com/files/8813/5551/7470/cruise-ship.png","D:" + File.separator, "HttpImgTest");
    System.out.println("Http的圖片下載: " + fileName_http);

    String fileName_https = downloadImageFromUrl("https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Plithocyon_armagnacensis.JPG/220px-Plithocyon_armagnacensis.JPG","D:" + File.separator, "HttpsImgTest");
    System.out.println("Https的圖片下載: " + fileName_https);

  }

  /**
   * Downloads the image at {@code url} into {@code fileDirectoryPath}.
   *
   * <p>The file extension is taken from the subtype of the response's
   * {@code Content-Type} header (for example {@code image/png} gives
   * {@code png}). Nothing is written unless the server answers with status 200
   * and an {@code image/*} content type.
   *
   * @param url http or https address of the image
   * @param fileDirectoryPath directory the file is written to
   * @param fileNameWithoutFormat file name without the extension
   * @return the path of the written file, or {@code null} when the URL is
   *     null, malformed or not HTTP(S), the response is not a 200 image
   *     response, or an I/O error occurs
   */
  public static String downloadImageFromUrl(String url, String fileDirectoryPath, String fileNameWithoutFormat) {
    if (url == null) {
      return null;
    }

    HttpURLConnection connection = null;
    try {
      connection = openConnection(url);
      if (connection == null) {
        // The URL resolved to something that is not an HTTP(S) connection (e.g. file:).
        return null;
      }

      connection.setRequestProperty("User-Agent", USER_AGENT);
      // Without timeouts an unresponsive server would block the caller forever.
      connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
      connection.setReadTimeout(READ_TIMEOUT_MS);
      connection.connect();

      if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
        return null;
      }

      String imageType = imageTypeOf(connection.getHeaderField("Content-Type"));
      if (imageType == null) {
        return null;
      }

      String targetPath = fileDirectoryPath + File.separator + fileNameWithoutFormat + "." + imageType;

      // Stream straight to disk; try-with-resources closes both streams even
      // when one of the close() calls fails.
      try (BufferedInputStream in = new BufferedInputStream(connection.getInputStream());
          FileOutputStream out = new FileOutputStream(new File(targetPath))) {
        byte[] buffer = new byte[BUFFER_SIZE];
        int read;
        while ((read = in.read(buffer)) != -1) {
          out.write(buffer, 0, read);
        }
      }

      return targetPath;
    } catch (MalformedURLException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (connection != null) {
        connection.disconnect();
      }
    }
    return null;
  }

  /**
   * Opens (without connecting) an HTTP(S) connection for {@code url}.
   *
   * <p>For {@code https://} URLs the trust-all connection is tried first; if
   * that fails, or for any other URL, a plain connection is opened.
   *
   * @return the connection, or {@code null} if the URL's protocol does not
   *     yield an {@link HttpURLConnection}
   * @throws IOException if the URL is malformed or the connection cannot be created
   */
  private static HttpURLConnection openConnection(String url) throws IOException {
    HttpURLConnection connection = null;
    if (url.startsWith("https://")) {
      connection = getHttpURLConnectionFromHttps(url);
    }
    if (connection == null) {
      java.net.URLConnection raw = new URL(url).openConnection();
      if (raw instanceof HttpURLConnection) {
        connection = (HttpURLConnection) raw;
      }
    }
    return connection;
  }

  /**
   * Extracts the image subtype from a {@code Content-Type} header value.
   *
   * <p>Parameters such as {@code ; charset=...} are ignored. The subtype is
   * used as a file extension, so it is only accepted when it consists of
   * letters, digits, {@code .}, {@code +} and {@code -}; this keeps a
   * server-supplied header from injecting path separators into the file name.
   *
   * @return the subtype (e.g. {@code png}), or {@code null} when the header is
   *     missing, is not an {@code image/*} type, or has an unusable subtype
   */
  private static String imageTypeOf(String contentType) {
    if (contentType == null) {
      return null;
    }
    int paramStart = contentType.indexOf(';');
    String mediaType = (paramStart >= 0 ? contentType.substring(0, paramStart) : contentType).trim();

    int slash = mediaType.indexOf('/');
    if (slash < 0) {
      return null;
    }
    if (!"image".equalsIgnoreCase(mediaType.substring(0, slash).trim())) {
      return null;
    }

    String subtype = mediaType.substring(slash + 1).trim();
    if (!subtype.matches("[A-Za-z0-9][A-Za-z0-9.+-]*")) {
      return null;
    }
    return subtype;
  }

  /**
   * Creates (without connecting) a connection for {@code url} whose TLS layer
   * accepts every server certificate.
   *
   * <p>The trust-all socket factory is installed on the returned connection
   * only; the JVM-wide default used by other HTTPS connections is left
   * untouched. Host name verification is not changed.
   *
   * <p>SECURITY NOTE: accepting every certificate disables server
   * authentication. Use only for testing or otherwise trusted sources.
   *
   * @param url address to open; expected to use the https scheme
   * @return the connection, or {@code null} if the TLS context or the
   *     connection could not be created, or the URL is not HTTP(S)
   */
  public static HttpURLConnection getHttpURLConnectionFromHttps(String url) {
    // A trust manager that performs no checks, i.e. trusts every certificate chain.
    TrustManager[] trustAllCerts = new TrustManager[] { new X509TrustManager() {

      @Override
      public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
        // Intentionally empty: accept everything.
      }

      @Override
      public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
        // Intentionally empty: accept everything.
      }

      @Override
      public X509Certificate[] getAcceptedIssuers() {
        // The API expects a non-null (possibly empty) array.
        return new X509Certificate[0];
      }

    } };

    try {
      SSLContext sslContext = SSLContext.getInstance("TLS");
      sslContext.init(null, trustAllCerts, new java.security.SecureRandom());

      java.net.URLConnection connection = new URL(url).openConnection();
      if (connection instanceof HttpsURLConnection) {
        // Scope the relaxed trust to this single connection.
        ((HttpsURLConnection) connection).setSSLSocketFactory(sslContext.getSocketFactory());
      }
      if (connection instanceof HttpURLConnection) {
        return (HttpURLConnection) connection;
      }
    } catch (KeyManagementException e) {
      e.printStackTrace();
    } catch (NoSuchAlgorithmException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }

    return null;
  }

}


源碼下載:
ImageDownloaderFromHttpOrHttps.7z

參考資料:

2017年10月5日 星期四

正規表達式 - Java 範例

紀錄下Java的正規表示法使用方法範例:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small demonstration of {@link Pattern} / {@link Matcher}: runs a regular
 * expression against a sentence and prints every match with its capture groups.
 */
public class RegTest {

  /**
   * Runs the same sentence through a reluctant and a greedy pattern.
   *
   * <p>{@code (?i)} makes the match case-insensitive. With the reluctant
   * quantifier {@code .*?} the output shows two matches, each
   * {@code [ABC/def]} with groups {@code ABC} and {@code def}. With the greedy
   * quantifier {@code .*} the output shows a single match spanning the whole
   * sentence, with groups {@code ABC/def]kk[123/456][ABC/def]kk[123} and
   * {@code 456}.
   */
  public static void main(String[] args) {
    // Reluctant: each group grabs as little as possible.
    doRegularExpression("[ABC/def]kk[123/456][ABC/def]kk[123/456]", "(?i)\\[(a.*?)/(.*?)\\]");
    // Greedy: each group grabs as much as possible.
    doRegularExpression("[ABC/def]kk[123/456][ABC/def]kk[123/456]", "(?i)\\[(a.*)/(.*)\\]");
  }

  /**
   * Prints the sentence, the pattern, and then every successive match found in
   * the sentence together with each of its capture groups.
   *
   * @param text sentence to search
   * @param regularExpression pattern to apply
   */
  static void doRegularExpression(String text, String regularExpression) {
    System.out.println("===========================");
    System.out.println("測試的句子: " + text);
    System.out.println("正規表示法: " + regularExpression);

    Matcher found = Pattern.compile(regularExpression).matcher(text);

    int matchNumber = 0;
    while (found.find()) {
      matchNumber++;
      // groupCount() excludes group 0, which is the whole matched string.
      int groups = found.groupCount();
      System.out.println("---------------------------");
      System.out.println("第" + matchNumber + "次匹配,找到" + groups + "個Group:");
      System.out.println("匹配的字串為: " + found.group(0));

      int index = 1;
      while (index <= groups) {
        System.out.println("第" + index + "個Group: " + found.group(index));
        index++;
      }
    }

    System.out.println("===========================");
    System.out.println();
  }

}