// Question snippet: tries to log in to the forum and then crawl thread links.
// Step 1: GET the login page; the response object is kept so its cookies can be reused.
Connection.Response loginForm = Jsoup.connect("http://picturepub.net/index.php?login/login").method(Connection.Method.GET) 
    .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0").timeout(0).execute(); 

// Step 2: POST the login form fields, sending back the cookies received by the GET above.
// NOTE(review): post() yields a Document, so the cookies of this login response are not captured anywhere.
Document doc = Jsoup.connect("http://picturepub.net/index.php?login/login").data("cookieexists", "false").data("cookie_check", "1").data("login", "swordblazer") 
    .data("password", "picturepub").data("register", "0").data("redirect", "/index.php").cookies(loginForm.cookies()) 
    .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0").post(); 

// Replaces the POST result with the parsed body of the initial GET response.
doc = loginForm.parse(); 

// NOTE(review): these are the cookies of the initial GET response, not of the login POST.
Map<String, String> cookies = loginForm.cookies(); 

// `urls` and `h` are not used in the visible part of this snippet.
List<String> urls = new ArrayList<String>(); 
List<String> threadUrls = new ArrayList<String>(); 
int h = 0; 
// Iterates i = 1..19.
for (int i = 1; i < 20; i++) { 
    // NOTE(review): this `if` has no braces, so it guards only the first request below.
    // The "page-" + i request runs on every iteration (including i == 1) and overwrites doc.
    if (i == 1) 
    doc = Jsoup.connect("http://picturepub.net/index.php?forums/photoshoots-magazines.51/") 
     .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0").cookies(cookies).get(); 
    doc = Jsoup.connect("http://picturepub.net/index.php?forums/photoshoots-magazines.51/page-" + i) 
     .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0").cookies(cookies).get(); 

    // get all links 
    Elements links = doc.select("a[href]"); 
    for (Element element : links) { 
    // Only links whose absolute URL contains "threads" are followed.
    if (element.absUrl("href").contains("threads")) { 
     String linkImage = element.absUrl("href"); 
     // NOTE(review): unlike the requests above, this one sets no userAgent; it is also
     // issued before the duplicate check below, so repeated links are fetched again.
     Document document = Jsoup.connect(linkImage).cookies(cookies).get(); 

     if (!threadUrls.contains(linkImage)) { 
     // (the snippet ends here; the remainder of the loop body is not shown)


您得到“403”可能是因为缺少某些参数/Cookie。既然您已经弄清楚了如何登录,那就用同样的办法:监控浏览器与网站之间的流量,查看浏览器实际发送了哪些内容。 – TDG


我做到了。除了需要发送给服务器的cookie以外,还有其他什么吗? – user236928


Cookie和所需的参数。 – TDG



JSoup 的各个连接彼此独立,因此它们不会共享“登录状态/会话”。您必须自己在各个请求之间小心地传递状态。您得到 HTTP 403 有以下几个原因:

  • loginForm 响应并不返回身份验证 Cookie,因此不能用它的 Cookie 访问需要授权的资源,但您之后使用的恰恰是这些 Cookie。
  • 要获取身份验证Cookie,您必须从POST http://picturepub.net/index.php?login/login响应中获取Cookie,并且不要将其转换为文档。必须使用method(POST)将第二个请求声明为POST请求。
  • 失败的请求 Jsoup.connect(linkImage).cookies(cookies).get(); 缺少 User-Agent。


/** User-Agent header value sent with every request (a desktop Firefox 50 string). */
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"; 
/** Common URL prefix; request paths such as "?login/login" are appended to it. */
private static final String BASE_URL = "http://picturepub.net/index.php"; 
/** Number of forum pages to visit; the page loop runs from 1 to this value inclusive. */
private static final int PAGE_COUNT = 20; 

/**
 * Logs in to the forum and walks the forum pages, probing every thread link found on them.
 *
 * Flow, as far as it is visible here: GET the login form, POST the login data with the
 * cookies from that GET, keep the cookies of the POST response, then request each of the
 * PAGE_COUNT forum pages with those cookies and issue a GET for every "threads/" link.
 *
 * NOTE(review): as extracted, several statements below stop before their terminating call
 * and semicolon (presumably `.execute()` / `.get()` and a stream collection step), and the
 * method ends before the loop bodies are closed. Confirm against the original answer.
 *
 * @throws IOException declared by the signature; presumably raised by the HTTP requests
 */
static void grab() 
     throws IOException { 
    out.println("Getting the login form..."); 
    // No cookies exist yet, hence the empty map.
    // NOTE(review): statement appears truncated (no terminating call or semicolon).
    final Response getLoginFormResponse = prepareConnection(GET, "?login/login", emptyMap()) 
    out.println("Posting the login data..."); 
    // Avoid converting to document when it's unnecessary and use `execute()` 
    // The cookies received with the login form are sent back with the login POST.
    final Response postLoginFormResponse = prepareConnection(POST, "?login/login", getLoginFormResponse.cookies()) 
      .data("cookieexists", "false") 
      .data("cookie_check", "1") 
      .data("login", ...YOUR USERNAME...) 
      .data("password", ...YOUR PASSWORD...) 
      .data("register", "0") 
      .data("redirect", "/index.php") 
    // NOTE(review): the chain above has no terminating call or semicolon as extracted.
    // Obtain the authentication cookies 
    final Map<String, String> cookies = postLoginFormResponse.cookies(); 
    // If you want to discard duplicates, just don't use lists -- sets are designed for unique elements. 
    // The `h` is unnecessary because you can query the collection for its size: threadUrls.size() 
    final Collection<String> threadUrls = new LinkedHashSet<>(); 
    for (int i = 1; i <= PAGE_COUNT; i++) { 
     out.printf("Page #%d...\n", i); 
     // Page 1 uses the bare forum path; later pages append "page-" + i.
     // NOTE(review): statement appears truncated (a Document is expected, but no call produces one).
     final Document getPageDocument = prepareConnection(GET, "?forums/photoshoots-magazines.51/" + (i == 1 ? "" : "page-" + i), cookies) 
     out.printf("Page #%d: %s\n", i, getPageDocument.title()); 
     // `a[href*=threads/]` is a selector to obtain all links having the "threads/" in <A> element URLs -- no need to check for substring later 
     // The following code uses Java 8 streams to filter out duplicate links on the page 
     // NOTE(review): the stream pipeline is incomplete as extracted (no stream() call and no collection step).
     final Iterable<String> hrefs = getPageDocument.select("a[href*=threads/]") 
       .map(e -> e.absUrl("href")) 
     for (final String href : hrefs) { 
      out.printf("Probing: %s ... ", href); 
      // prepareConnection() prepends BASE_URL, so the absolute href is reduced to its suffix first.
      // NOTE(review): the snippet ends on this unfinished statement; the rest of the method is not shown.
      final Response analyzeMeResponse = prepareConnection(GET, stripBaseUrl(href), cookies) 

private static String stripBaseUrl(final String url) 
     throws IllegalArgumentException { 
    if (!url.startsWith(BASE_URL)) { 
     // This must not happen for a well-written parser 
     throw new IllegalArgumentException(url); 
    return url.substring(BASE_URL.length()); 

// Just make sure that a particular connection is: 
// * bound to the BASE_URL defined above 
// * bound to a specific HTTP method 
// * follows redirects 
// * User-Agent is set 
// * cookies are always set 
private static Connection prepareConnection(final Method method, final String url, final Map<String, String> cookies) { 
    return connect(BASE_URL + url) 

上面的代码基于 org.jsoup:jsoup:1.10.1,因为上一个 JSoup 版本无法处理该网站所使用的 HTTP 307 Temporary Redirect。
