2016-01-19 131 views
0

我试图从维基百科下载数据。我发送一个GET请求,但返回只包含页面状态和一些HTML细节。无法通过Wikipedia API下载数据

我在做什么错?

#include <winsock2.h> 
#include <WS2tcpip.h> 
#include <windows.h> 
#include <iostream> 

int main(){ 

WSADATA wsaData; 
if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) { 
    cout << "WSAStartup failed.\n"; 
    system("pause"); 
    return -1; 
} 

struct addrinfo hints; 
ZeroMemory(&hints, sizeof(hints)); 
hints.ai_family = AF_INET;   
hints.ai_protocol = IPPROTO_TCP;  
hints.ai_socktype = SOCK_STREAM;  


struct addrinfo* targetAdressInfo = NULL; 
DWORD getAddrRes = getaddrinfo("www.wikipedia.org", NULL, &hints, &targetAdressInfo); 
if (getAddrRes != 0 || targetAdressInfo == NULL) 
{ 
    cout << "Could not resolve the Host Name" << endl; 
    system("pause"); 
    WSACleanup(); 
    return -1; 
} 

SOCKADDR_IN sockAddr; 
sockAddr.sin_addr = ((struct sockaddr_in*) targetAdressInfo->ai_addr)->sin_addr; 
sockAddr.sin_family = AF_INET; 
sockAddr.sin_port = htons(80); 

freeaddrinfo(targetAdressInfo); 

SOCKET webSocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); 
if (webSocket == INVALID_SOCKET) 
{ 
    cout << "Creation of the Socket Failed" << endl; 
    system("pause"); 
    WSACleanup(); 
    return -1; 
} 

if (connect(webSocket, (SOCKADDR*)&sockAddr, sizeof(sockAddr)) != 0) 
{ 
    cout << "Could not connect"; 
    system("pause"); 
    closesocket(webSocket); 
    WSACleanup(); 
    return -1; 
} 

// Sending a HTTP-GET-Request to the Web Server 
const char* httpRequest = "GET/http://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=Google HTTP/1.1\r\n\r\n"; 
int sentBytes = send(webSocket, httpRequest, strlen(httpRequest), 0); 
if (sentBytes < strlen(httpRequest) || sentBytes == SOCKET_ERROR) 
{ 
    cout << "Could not send the request to the Server" << endl; 
    system("pause"); 
    closesocket(webSocket); 
    WSACleanup(); 
    return -1; 
} 

char buffer[1000000]; 

ZeroMemory(buffer, sizeof(buffer)); 
int dataLen; 
while ((dataLen = recv(webSocket, buffer, sizeof(buffer), 0) > 0)) 
{ 
    int i = 0; 
    while (buffer[i] >= 32 || buffer[i] == '\n' || buffer[i] == '\r') { 
     cout << buffer[i]; 
     i += 1; 
    } 
} 

closesocket(webSocket); 
WSACleanup(); 

system("pause"); 
return 0; 
} 
+0

您的GET请求似乎格式错误。我建议使用工具(例如Wireshark)来检查浏览器发送的标题。 –

+0

您必须使用HTTPS。此外,询问有关错误消息的问题,但不说明错误消息的含义并不是很有帮助。 – Tgr

回答

0

如果我记得维基百科会发给你一个HTTP 301 redirection回复。

您需要解析以获取目标网址,然后才能访问文章内容。

您可以看到一些示例代码,用于执行此操作here

+0

非常感谢,现在我会试试 –

+0

我尝试使用示例来获得HTTP 301重定向,但是当我在wiki_fetch_raw函数中执行解析任务时,我没有得到任何页面的主体? –

+0

您需要在您从第一个HTTP请求获取的目标URL处执行第二个HTTP请求。请注意,在示例代码中,wiki_fetch_raw函数是递归的,以便处理多个重定向级别。 –

0

如何使用python脚本?

import urllib2 
def get_page(url): 
    request = urllib2.Request(url) 
    request = urllib2.urlopen(request) 
    data = request.read() 
    return data 
url = "http://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=Google HTTP/1.1\r\n\r\n" 
print get_page(url) 
+0

Python是我的BFF,但在项目的其余部分,我需要使用C++ –