2017-03-18 105 views
1

我有一个(非常丑陋的)方法,用来从网站获取页面以及页面上的所有图像。获取网页本身完全没有问题,但是获取到的图像看起来非常奇怪,与服务器发送的原始图像不一致。我一直用来测试的 URI 是:http://www.themountaingoats.net/contact.html 。这个网页非常简单,并且包含了我测试所需的所有内容。如何通过同一个套接字连接发送图像和文本?

使用 \r 或 \n 作为行结束符会得到不同的结果,而使用 \r\n 时图像根本无法打开。

/**
 * First attempt: fetches a page over a raw socket and then requests every {@code <img>} the page
 * references over the same connection, reading ALL responses as text lines.
 *
 * <p>The page response is read with {@code readLine()} and every line (status line and headers
 * included) is appended to {@code <domain>.txt}. That file is parsed with Jsoup, and for each
 * image the lines read after the first blank line are appended to a file named after the image
 * path with "/" replaced by ".".
 *
 * <p>NOTE(review): the image bytes pass through {@code InputStreamReader} (bytes decoded to
 * chars), {@code readLine()} (line terminators stripped) and {@code FileWriter} (chars encoded
 * again, a fixed "\r" appended). None of these steps is byte-preserving, which matches the
 * corrupted images described in the question.
 *
 * <p>NOTE(review): the socket is never closed in this version.
 *
 * @param uri  host and path without scheme, e.g. {@code www.example.com/index.html}; it is split
 *             at the first "/"
 * @param port TCP port to connect to
 * @return always {@code null}
 * @throws IOException if connecting, reading from the socket or writing a file fails
 */
public static String GET(String uri, int port) throws IOException {

     // Everything before the first "/" is the host, everything after it is the path.
     // NOTE(review): a uri without any "/" makes the [1] access throw ArrayIndexOutOfBoundsException.
     String domain = uri.split("/",2)[0];
     String filename = uri.split("/",2)[1];
     Socket socket = new Socket(domain, port);


     // send the command to the server.
     System.out.println(socket.isConnected());
     DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
     // Character-based reader over the socket; used for the page AND for the images below.
     BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
     String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
     System.out.println(request);
     outToServer.writeBytes(request);

     // Create the file that receives the page response.
     File file = new File(domain+".txt");
     // If the file does not exist yet, create it.
     if (!file.exists()) {
      file.createNewFile();
     }
     // Opening a PrintWriter on the file truncates it, so a previous run's content is discarded.
     PrintWriter writer = new PrintWriter(file);
     writer.print("");
     writer.close();

     // Placeholder budget until a Content-Length header replaces it; the loop ends once the
     // budget drops below zero.
     int characterCounter=100;
     while(characterCounter >= 0){
      // NOTE(review): readLine() returns null at end of stream; the startsWith call below would
      // then throw a NullPointerException.
      String serverSentence = inFromServer.readLine();
      System.out.println(serverSentence);
      if (serverSentence.startsWith("Content-Length:")){
       characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
      }
      // Every line that does not start with one of the listed header names is charged against
      // the budget. NOTE(review): the "- 1" presumably stands for the line terminator stripped
      // by readLine(); a CRLF terminator is two bytes and length() counts chars, not bytes — confirm.
      if (!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
        && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
        && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ")){
       characterCounter = characterCounter - serverSentence.length()-1;
      }

      // Append the line to the page file; the file is reopened in append mode for every line.
      FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
      BufferedWriter bw = new BufferedWriter(fw);
      bw.write(serverSentence+"\r\n");
      bw.close();
     }


     // Parse the saved response (headers included) and collect all <img> elements.
     Document doc = Jsoup.parse(file, "UTF-8");
     Elements imgs = doc.getElementsByTag("img");

     System.out.println(imgs);


     for (Element link : imgs) {
      String source = link.attr("src");

      // Turn an absolute URL on this host into a host-relative path.
      source = source.replace("http://"+domain+"", "");

      System.out.println(source);


      // The local file name is the image path with every "/" replaced by ".".
      File image = new File(source.replace("/", "."));
      // If the file does not exist yet, create it.
      if (!image.exists()) {
       image.createNewFile();
      }

      // Truncate the image file before appending to it below.
      PrintWriter imageWriter = new PrintWriter(image);
      imageWriter.print("");
      imageWriter.close();

      // The image request goes out on the same socket as the page request.
      String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
      System.out.println(requestImage);
      outToServer.writeBytes(requestImage);

      // flag becomes true at the first non-empty line that follows an empty line, i.e. at the
      // first line after the blank line that ends the response header.
      boolean flag = false;
      String previousServerSentence = "something not empty";
      characterCounter=100;
      while(characterCounter > 0){
       String serverSentence = inFromServer.readLine();
       System.out.println(serverSentence);
       if (serverSentence.startsWith("Content-Length:")){
        characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
       }

       if (!flag){
        if (previousServerSentence.matches("") && !serverSentence.matches("")){
         flag = true;
        }
       }

       // Lines are written only after the header has ended and only when they do not start
       // with one of the listed header names.
       // NOTE(review): a body "line" that happens to start with one of these names is dropped.
       if ((!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
         && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
         && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") && !serverSentence.startsWith("ETag: ") && !serverSentence.startsWith("Accept-Ranges: ")
         && !serverSentence.startsWith("Accept-Language: ") && !serverSentence.startsWith("Accept-Datetime: ") && !serverSentence.startsWith("Authorization: ")
         && !serverSentence.startsWith("Connection: ") && !serverSentence.startsWith("Content-Language: ") && !serverSentence.startsWith("Content-Length: ")
         && !serverSentence.startsWith("Content-Location: ") && !serverSentence.startsWith("Content-MD5: ") && !serverSentence.startsWith("Content-Range: ")
         && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("expect: ")
         && !serverSentence.startsWith("From: ") && !serverSentence.startsWith("Host: ") && !serverSentence.startsWith("If-Match: ") && !serverSentence.startsWith("If-Modified-Since: ")
         && !serverSentence.startsWith("Accept: ") && !serverSentence.startsWith("Accept-Charset: ") && !serverSentence.startsWith("Accept-Encoding: ")
         && !serverSentence.startsWith("Age: ") && !serverSentence.startsWith("Allow: ") && !serverSentence.startsWith("Content-Encoding: ")
         && !serverSentence.startsWith("If-None-Match: ") && !serverSentence.startsWith("If-Range: ") && !serverSentence.startsWith("If-Unmodified-Since: ")
         && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Location: ") && !serverSentence.startsWith("Max-Forwards: ")
         && !serverSentence.startsWith("Pragma: ") && !serverSentence.startsWith("Proxy-Authenticate: ") && !serverSentence.startsWith("Proxy-Authorization: ")
         && !serverSentence.startsWith("Range: ") && !serverSentence.startsWith("Referer: ") && !serverSentence.startsWith("Retry-After: ")
         && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("TE: ") && !serverSentence.startsWith("Trailer: ")
         && !serverSentence.startsWith("Transfer-Encoding: ") && !serverSentence.startsWith("Upgrade: ") && !serverSentence.startsWith("User-Agent: ")
         && !serverSentence.startsWith("Via: ") && !serverSentence.startsWith("Warning: ") && !serverSentence.startsWith("WWW-Authenticate: "))
         && flag){
        characterCounter = characterCounter - serverSentence.length()-1;
        // Append the line to the image file.
        // NOTE(review): whatever terminator readLine() removed ("\n", "\r" or "\r\n") is
        // replaced by a single "\r" here, so the original bytes cannot be reconstructed.

        FileWriter fw = new FileWriter(image.getAbsoluteFile(),true);
        BufferedWriter bw = new BufferedWriter(fw);
        bw.write(serverSentence+"\r");
        bw.close();


       }

       previousServerSentence = serverSentence;
      }


     }
     return null;
    }

enter image description here enter image description here

enter image description here

第一张图像是用 \r 作为行结束符得到的,第二张图像是用 \n 作为行结束符得到的,最后一张是原始图像。我完全不知道为什么图像会损坏。

所以我的问题是:为什么会发生这种情况,我该如何解决它?

编辑:

/**
 * Second attempt: the page is still read as text lines, but each image response is now copied
 * as raw bytes from the socket's {@code InputStream} into a file.
 *
 * <p>The page response (status line and headers included) is appended line by line to
 * {@code <domain>.txt}, which is then parsed with Jsoup. For every {@code <img>} a request is
 * sent on the same socket and the bytes after the first {@code "\r\n\r\n"} are written to a
 * file named after the image path with "/" replaced by ".".
 *
 * @param uri  host and path without scheme, e.g. {@code www.example.com/index.html}; it is split
 *             at the first "/"
 * @param port TCP port to connect to
 * @return always {@code null}
 * @throws IOException if connecting, reading from the socket or writing a file fails
 */
public static String GET(String uri, int port) throws IOException {

     /*
     * Retrieval of the webpage
     */

     // Everything before the first "/" is the host, everything after it is the path.
     String domain = uri.split("/",2)[0];
     String filename = uri.split("/",2)[1];
     Socket socket = new Socket(domain, port);


     // send the command to the server.
     System.out.println(socket.isConnected());
     DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
     BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
     String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
     System.out.println(request);
     outToServer.writeBytes(request);

     // Create the file that receives the page response.
     File file = new File(domain+".txt");
     // If the file does not exist yet, create it.
     if (!file.exists()) {
      file.createNewFile();
     }
     // Opening a PrintWriter on the file truncates it, so a previous run's content is discarded.
     PrintWriter writer = new PrintWriter(file);
     writer.print("");
     writer.close();

     // Placeholder budget until a Content-Length header replaces it; the loop ends once the
     // budget drops below zero.
     int characterCounter=100;
     while(characterCounter >= 0){
      // NOTE(review): readLine() returns null at end of stream; the startsWith call below would
      // then throw a NullPointerException.
      String serverSentence = inFromServer.readLine();
      System.out.println(serverSentence);
      if (serverSentence.startsWith("Content-Length:")){
       characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
      }
      // Lines that do not start with one of the listed header names are charged against the budget.
      if (!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
        && !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
        && !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ")){
       characterCounter = characterCounter - serverSentence.length()-1;
      }

      // Append the line to the page file; the file is reopened in append mode for every line.
      FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
      BufferedWriter bw = new BufferedWriter(fw);
      bw.write(serverSentence+"\r\n");
      bw.close();
     }

     /*
     * Retrieval of all the embedded images on the webpage that are on the same domain.
     */

     Document doc = Jsoup.parse(file, "UTF-8");
     Elements imgs = doc.getElementsByTag("img");

     System.out.println(imgs);



     for (Element link : imgs) {
      String source = link.attr("src");

      // Turn an absolute URL on this host into a host-relative path.
      source = source.replace("http://"+domain+"", "");

      System.out.println(source);

      // The local file name is the image path with every "/" replaced by ".".
      File image = new File(source.replace("/", "."));
      // If the file does not exist yet, create it.
      if (!image.exists()) {
       image.createNewFile();
      }

      // Initialize the streams.
      final FileOutputStream fileOutputStream = new FileOutputStream(image);
      // NOTE(review): this is the same underlying socket stream that inFromServer wraps; bytes
      // the BufferedReader has already read ahead are not visible through this reference.
      final InputStream inputStream = socket.getInputStream();

      // Header end flag.
      boolean headerEnded = false;

      String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
      System.out.println(requestImage);
      outToServer.writeBytes(requestImage);

      int buffersize = 1000000;
      byte[] bytes = new byte[buffersize];
      int length;

      // The loop only ends when read() reports end of stream (-1).
      // NOTE(review): on a connection the server keeps open this blocks after the last byte of
      // the response instead of stopping at Content-Length — confirm.
      while ((length = inputStream.read(bytes)) != -1) {
       // If the end of the header had already been reached, write the bytes to the file as normal.
       if (headerEnded){

        fileOutputStream.write(bytes, 0, length);
       }
       // This locates the end of the header by comparing the current byte as well as the next 3 bytes
       // with the HTTP header end "\r\n\r\n" (which in integer representation would be 13 10 13 10).
       // If the end of the header is reached, the flag is set to true and the remaining data in the
       // currently buffered byte array is written into the file.
       // NOTE(review): both the scan limit and the write count below use buffersize, not the
       // number of bytes actually read (length), so array positions beyond this read are
       // scanned and written to the file as well.
       else {
        for (int i = 0; i < buffersize-3; i++) {
         if (bytes[i] == 13 && bytes[i + 1] == 10 && bytes[i + 2] == 13 && bytes[i + 3] == 10) {
          headerEnded = true;
          fileOutputStream.write(bytes, i+4 , buffersize-i-4);
          break;
         }
        }
       }
      }

      // NOTE(review): per the java.net.Socket documentation, closing the stream returned by
      // getInputStream() closes the socket, so a following loop iteration cannot reuse it — verify.
      inputStream.close();
      fileOutputStream.close();

     }
     socket.close();
     return null;
    }

这是我现在得到的结果(图像):

​​

我能得到图像的一部分,但得不到完整的图像。调整缓冲区大小可以让我多得到一点,也可能反而更少。

EDIT2:我找到了错误。问题只出在几处长度参数上:查找头部结尾和写入文件时应使用实际读取的字节数 length,而不是 buffersize。最终可以工作的代码:

/**
 * Downloads a web page and every image it embeds from the same host, using ONE raw socket
 * connection for all requests.
 *
 * <p>The page response (status line, headers and body) is stored in {@code <domain>.txt}; that
 * file is parsed with Jsoup to find the {@code <img>} elements. Each image body is stored, byte
 * for byte, in a file named after its {@code src} path with every "/" replaced by ".".
 *
 * <p>All responses are read as bytes through a single buffered stream and each body is delimited
 * by its {@code Content-Length}, so consecutive responses on the kept-alive connection cannot
 * bleed into each other and binary data is never decoded as text. Images with an empty
 * {@code src}, a {@code data:} URI or a URL on another host are skipped.
 *
 * <p>Limitations: chunked transfer encoding is rejected with an {@link IOException}; a response
 * without {@code Content-Length} is read until the server closes the connection. Requests and
 * response headers are echoed to standard output for diagnosis.
 *
 * @param uri  host and path, e.g. {@code www.example.com/index.html}; a leading
 *             {@code http://} is tolerated and a missing path means "/"
 * @param port TCP port of the web server
 * @return always {@code null} (kept for compatibility with existing callers)
 * @throws IOException if the connection fails, a response is truncated or uses an unsupported
 *                     framing, or a file cannot be written
 */
public static String GET(String uri, int port) throws IOException {

    /*
    * Retrieval of the webpage
    */

    // Split "host/path" at the first "/"; a uri without any "/" requests the root document.
    String target = uri.startsWith("http://") ? uri.substring("http://".length()) : uri;
    String[] parts = target.split("/", 2);
    String domain = parts[0];
    String filename = parts.length > 1 ? parts[1] : "";

    // try-with-resources closes the socket (and with it both streams) even when an error occurs.
    try (Socket socket = new Socket(domain, port)) {
        System.out.println(socket.isConnected());
        DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
        // A single byte stream serves every response. Wrapping the socket in a second reader or
        // stream would strand the bytes the first wrapper has already buffered.
        InputStream inFromServer = new java.io.BufferedInputStream(socket.getInputStream());

        File file = new File(domain + ".txt");
        sendGet(outToServer, "/" + filename, domain);
        saveResponse(inFromServer, file, true);

        /*
        * Retrieval of all the embedded images on the webpage that are on the same domain.
        */

        Document doc = Jsoup.parse(file, "UTF-8");
        Elements imgs = doc.getElementsByTag("img");
        System.out.println(imgs);

        // Directory of the page; relative image paths are resolved against it.
        String baseDir = "/" + filename.substring(0, filename.lastIndexOf('/') + 1);

        for (Element link : imgs) {
            // Turn an absolute URL on this host into a host-relative path.
            String source = link.attr("src").replace("http://" + domain, "");
            System.out.println(source);

            // Nothing to fetch over this connection: no src, inline data, or another host.
            if (source.isEmpty() || source.startsWith("data:")
                    || source.startsWith("//") || source.contains("://")) {
                continue;
            }

            String path = source.startsWith("/") ? source : baseDir + source;
            File image = new File(source.replace("/", "."));

            sendGet(outToServer, path, domain);
            saveResponse(inFromServer, image, false);
        }
    }
    return null;
}

/** Writes a minimal HTTP/1.1 GET request for {@code path} on {@code host} and flushes it. */
private static void sendGet(DataOutputStream out, String path, String host) throws IOException {
    // The request line must end directly with CRLF; there is no space after the version.
    String request = "GET " + path + " HTTP/1.1\r\n" + "Host: " + host + "\r\n\r\n";
    System.out.println(request);
    out.writeBytes(request);
    out.flush();
}

/**
 * Reads exactly one HTTP response from {@code in} and stores its body in {@code target},
 * optionally preceded by the raw header block.
 */
private static void saveResponse(InputStream in, File target, boolean keepHeader)
        throws IOException {
    String header = readHeader(in);
    System.out.println(header);
    // -1 means "length unknown": read until the server closes the connection.
    long remaining = contentLengthOf(header);

    // FileOutputStream creates the file if needed and truncates an existing one.
    try (FileOutputStream out = new FileOutputStream(target)) {
        if (keepHeader) {
            // ISO-8859-1 maps chars 0-255 to the identical byte values, restoring the raw header.
            out.write(header.getBytes("ISO-8859-1"));
        }
        byte[] buffer = new byte[8192];
        while (remaining != 0) {
            // Never ask for more than this response still owes; the next response follows it.
            int want = remaining < 0 ? buffer.length : (int) Math.min(buffer.length, remaining);
            int got = in.read(buffer, 0, want);
            if (got == -1) {
                if (remaining > 0) {
                    throw new IOException(
                            "Connection closed with " + remaining + " body bytes still expected");
                }
                break;
            }
            out.write(buffer, 0, got);
            if (remaining > 0) {
                remaining -= got;
            }
        }
    }
}

/**
 * Consumes bytes up to and including the blank line ("\r\n\r\n") that ends a response header
 * and returns them with each byte mapped to the char of the same value.
 */
private static String readHeader(InputStream in) throws IOException {
    StringBuilder header = new StringBuilder();
    int b;
    // Byte-wise reading is cheap on the buffered stream and never consumes body bytes.
    while ((b = in.read()) != -1) {
        header.append((char) b);
        int n = header.length();
        if (b == '\n' && n >= 4
                && header.charAt(n - 4) == '\r' && header.charAt(n - 3) == '\n'
                && header.charAt(n - 2) == '\r') {
            return header.toString();
        }
    }
    throw new IOException("Connection closed before the end of the response header");
}

/**
 * Extracts the Content-Length from a raw header block; header names are matched
 * case-insensitively. Returns -1 when the header is absent.
 */
private static long contentLengthOf(String header) throws IOException {
    long length = -1;
    for (String line : header.split("\r\n")) {
        int colon = line.indexOf(':');
        if (colon < 0) {
            continue; // status line or blank line
        }
        String name = line.substring(0, colon).trim();
        String value = line.substring(colon + 1).trim();
        if (name.equalsIgnoreCase("Transfer-Encoding")
                && value.toLowerCase(java.util.Locale.ROOT).contains("chunked")) {
            // A chunked body is not delimited by Content-Length; copying it verbatim would
            // store the chunk framing inside the file.
            throw new IOException("Chunked transfer encoding is not supported");
        }
        if (name.equalsIgnoreCase("Content-Length")) {
            try {
                length = Long.parseLong(value);
            } catch (NumberFormatException e) {
                throw new IOException("Malformed Content-Length: " + value, e);
            }
        }
    }
    return length;
}
+0

你必须使用套接字(Socket),而不能用 URLConnection 吗? – JCasso

+0

这是一项作业,我们必须使用套接字,不能使用 URLConnection 之类的类来检索数据 – Sander

回答

1

处理 HTTP 请求时,尽可能避免使用原始套接字。

如果你可以使用单独的连接来获取图像文件,请参见 4ndrew 的答案: https://stackoverflow.com/a/8679160/176873

如果你只能使用原始套接字,请尽量避免使用 java.io.BufferedReader。BufferedReader 不应该用来读取二进制数据。你正在把二进制数据转换成字符串,并把它当作文本文件写入本地磁盘。

解决方法请参见 Alexay 的答案: https://stackoverflow.com/a/34106534/176873

+0

你的第二个链接对我来说几乎可行:我现在可以下载图像的一部分。但不管我把缓冲区设置得多大,我都只能得到图像的前几行。如果我把缓冲区大小设置得非常大,得到的行数似乎反而会减少。更改后的代码已编辑到我的问题中 – Sander

+0

我接受了你的答案,因为它使我找到了一个解决方案,尽管我仍然不得不思考一点。非常感谢你。 – Sander