2016-01-25 63 views
1

我正在使用Python 3.4。我知道如何利用BeautifulSoup来抓取网页,但我想找出完成这项工作最高效的方法。Nexus factory image page(Android)列出了所有Nexus设备,并会在新版本发布时更新。最新的版本总是添加在相应表格的底部。我有一份各设备名称的列表,包括正式名称和代号,并且我只提取这些设备的信息(设备列表本身每年最多更新一次,而且只有部分设备仍会收到更新)。(问题标题:抓取表格最底部的行)

提取每个表格最底部条目的最高效方式是什么?我打算把每个表格底部行中第一个<td>的字符串保存为pickle对象,以便之后可以轻松地比较字符串,检查当前最底部的一行是否是新的,但我不确定获取该条目本身的最佳方法是什么。

每个<tr>都有一个格式为devnamebuildnumber(即设备名后接构建号)的id。由于我有每个设备的名称,并且会保存最新的版本字符串,我应该可以用soup.find("tr", id=dev + buildstring)来查找。然而,这会返回所找到行的所有兄弟节点和子节点,所以我不知道怎样才能最好地利用它。

回答

2

下面的代码可以帮你入门。思路是找出所有带有id属性的h2元素——除第一个之外,它们都是设备名称元素。对于找到的每个元素,我们获取其后的下一个table元素,并将各版本解析为一个列表。实现如下:

from pprint import pprint 

import requests 
from bs4 import BeautifulSoup 


# Listing page to scrape. Per the accompanying answer, each device is an
# <h2 id=...> heading followed by a table with one <tr id=...> row per build.
url = "https://developers.google.com/android/nexus/images"

# Bound the wait so a stalled connection cannot hang the script, and fail
# loudly on an HTTP error instead of parsing an error page into empty results.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, "lxml")

# Maps heading text (device name) -> list of first-cell texts, in page order.
data = {}
# The first <h2> with an id is skipped: per the answer it is not a device
# heading (NOTE(review): depends on the page layout -- re-check if it changes).
for device in soup.find_all("h2", id=True)[1:]:
    table = device.find_next("table")
    if table is None:
        # Heading with no table after it: nothing to record for this device.
        continue

    device_name = device.get_text(strip=True)

    versions = []
    for row in table.find_all("tr", id=True):
        cell = row.find("td")
        # Skip rows without a <td> rather than crash on None.get_text().
        if cell is not None:
            versions.append(cell.get_text(strip=True))
    data[device_name] = versions

pprint(data)

它会打印一个字典,以设备名称为键,以版本列表为值:

{'"angler" for Nexus 6P': ['6.0.0 (MDA89D)', 
          '6.0.0 (MDB08K)', 
          '6.0.0 (MDB08L)', 
          '6.0.0 (MDB08M)', 
          '6.0.0 (MMB29N)', 
          '6.0.1 (MMB29M)', 
          '6.0.1 (MMB29P)'], 
'"bullhead" for Nexus 5X': ['6.0.0 (MDA89E)', 
          '6.0.0 (MDB08I)', 
          '6.0.0 (MDB08L)', 
          '6.0.0 (MDB08M)', 
          '6.0.1 (MMB29K)', 
          '6.0.1 (MMB29P)'], 
'"fugu" for Nexus Player': ['5.0 (LRX21M)', 
          '5.0 (LRX21V)', 
          '5.1.0 (LMY47D)', 
          '5.1.1 (LMY47V)', 
          '5.1.1 (LMY48J)', 
          '5.1.1 (LMY48N)', 
          '6.0.0 (MRA58K)', 
          '6.0.0 (MRA58N)', 
          '6.0.1 (MMB29M)', 
          '6.0.1 (MMB29T)'], 
'"hammerhead" for Nexus 5 (GSM/LTE)': ['4.4 (KRT16M)', 
             '4.4.2 (KOT49H)', 
             '4.4.3 (KTU84M)', 
             '4.4.4 (KTU84P)', 
             '4.4.4 Release 2 (For 2Degrees/NZ, ' 
             'Telstra/AUS and India ONLY) (KTU84Q)', 
             '5.0 (LRX21O)', 
             '5.0.1 (LRX22C)', 
             '5.1.0 (LMY47D)', 
             '5.1.0 (LMY47I)', 
             '5.1.1 (LMY48B)', 
             '5.1.1 (LMY48I)', 
             '5.1.1 (LMY48M)', 
             '6.0.0 (MRA58K)', 
             '6.0.0 (MRA58N)', 
             '6.0.1 (MMB29K)', 
             '6.0.1 (MMB29S)'], 
'"mantaray" for Nexus 10': ['4.2.2 (JDQ39)', 
          '4.3 (JWR66Y)', 
          '4.4 (KRT16S)', 
          '4.4.2 (KOT49H)', 
          '4.4.3 (KTU84L)', 
          '4.4.4 (KTU84P)', 
          '5.0 (LRX21P)', 
          '5.0.1 (LRX22C)', 
          '5.0.2 (LRX22G)', 
          '5.1.0 (LMY47D)', 
          '5.1.1 (LMY47V)', 
          '5.1.1 (LMY48I)', 
          '5.1.1 (LMY48M)', 
          '5.1.1 (LMY48T)', 
          '5.1.1 (LMY48X)', 
          '5.1.1 (LMY48Z)', 
          '5.1.1 (LMY49F)'], 
'"mysid" for Galaxy Nexus "toro" (Verizon CDMA/LTE)': ['4.0.4 (IMM76K)', 
                 '4.1.1 (JRO03O)', 
                 '4.2.2 (JDQ39)'], 
'"mysidspr" for Galaxy Nexus "toroplus" (Sprint CDMA/LTE)': ['4.1.1 (FH05)', 
                   '4.2.1 (GA02)'], 
'"nakasi" for Nexus 7 (Wi-Fi)': ['4.1.2 (JZO54K)', 
            '4.2.2 (JDQ39)', 
            '4.3 (JWR66Y)', 
            '4.4 (KRT16S)', 
            '4.4.2 (KOT49H)', 
            '4.4.3 (KTU84L)', 
            '4.4.4 (KTU84P)', 
            '5.0 (LRX21P)', 
            '5.0.2 (LRX22G)', 
            '5.1.0 (LMY47D)', 
            '5.1.1 (LMY47V)'], 
'"nakasig" for Nexus 7 (Mobile)': ['4.2.2 (JDQ39)', 
            '4.3 (JWR66Y)', 
            '4.4 (KRT16S)', 
            '4.4.2 (KOT49H)', 
            '4.4.3 (KTU84L)', 
            '4.4.4 (KTU84P)', 
            '5.0.2 (LRX22G)', 
            '5.1.0 (LMY47D)', 
            '5.1.1 (LMY47V)'], 
'"occam" for Nexus 4': ['4.2.2 (JDQ39)', 
         '4.3 (JWR66Y)', 
         '4.4 (KRT16S)', 
         '4.4.2 (KOT49H)', 
         '4.4.3 (KTU84L)', 
         '4.4.4 (KTU84P)', 
         '5.0 (LRX21T)', 
         '5.0.1 (LRX22C)', 
         '5.1.0 (LMY47O)', 
         '5.1.1 (LMY47V)', 
         '5.1.1 (LMY48I)', 
         '5.1.1 (LMY48M)', 
         '5.1.1 (LMY48T)'], 
'"razor" for Nexus 7 [2013] (Wi-Fi)': ['4.3 (JSS15Q)', 
             '4.3 (JSS15R)', 
             '4.4 (KRT16S)', 
             '4.4.2 (KOT49H)', 
             '4.4.3 (KTU84L)', 
             '4.4.4 (KTU84P)', 
             '5.0 (LRX21P)', 
             '5.0.1 (LRX22C)', 
             '5.0.2 (LRX22G)', 
             '5.1.0 (LMY47O)', 
             '5.1.1 (LMY47V)', 
             '5.1.1 (LMY48G)', 
             '5.1.1 (LMY48I)', 
             '5.1.1 (LMY48M)', 
             '5.1.1 (LMY48T)', 
             '6.0.0 (MRA58K)', 
             '6.0.0 (MRA58U)', 
             '6.0.0 (MRA58V)', 
             '6.0.1 (MMB29K)', 
             '6.0.1 (MMB29O)'], 
'"razorg" for Nexus 7 [2013] (Mobile)': ['4.3 (JLS36C)', 
              '4.3.1 (JLS36I)', 
              '4.4 (KRT16S)', 
              '4.4.2 (KOT49H)', 
              '4.4.2_r2 (Verizon) (KVT49L)', 
              '4.4.3 (KTU84L)', 
              '4.4.4 (KTU84P)', 
              '5.0.2 (LRX22G)', 
              '5.1.0 (LMY47O)', 
              '5.1.1 (LMY47V)', 
              '5.1.1 (LMY48P)', 
              '5.1.1 (LMY48U)', 
              '5.1.1 (LMY48X)', 
              '5.1.1 (LMY48Z)', 
              '6.0.0 (MRA58K)', 
              '6.0.0 (MRA58N)', 
              '6.0.0 (MRA58V)', 
              '6.0.0 (MRA59B)', 
              '6.0.1 (MMB29K)', 
              '6.0.1 (MMB29O)'], 
'"ryu" for Pixel C': ['6.0.1 (MXB48J)', '6.0.1 (MXB48K)'], 
'"shamu" for Nexus 6': ['5.0 (LRX21O)', 
         '5.0.1 (LRX22C)', 
         '5.1.0 (LMY47D)', 
         '5.1.0 (LMY47E)', 
         '5.1.0 (LMY47I)', 
         '5.1.0 (For T-Mobile ONLY) (LMY47M)', 
         '5.1.1 (All carriers except T-Mobile US) (LMY47Z)', 
         '5.1.1 (For T-Mobile ONLY) (LYZ28E)', 
         '5.1.1 (For Project Fi ONLY) (LVY48C)', 
         '5.1.1 (LMY48I)', 
         '5.1.1 (For T-Mobile ONLY) (LYZ28J)', 
         '5.1.1 (For Project Fi ONLY) (LVY48E)', 
         '5.1.1 (LMY48M)', 
         '5.1.1 (For T-Mobile ONLY) (LYZ28K)', 
         '5.1.1 (For Project Fi ONLY) (LVY48F)', 
         '5.1.1 (LMY48T)', 
         '5.1.1 (For T-Mobile ONLY) (LYZ28M)', 
         '5.1.1 (For Project Fi ONLY) (LVY48H)', 
         '5.1.1 (LMY48W)', 
         '5.1.1 (LMY48X)', 
         '5.1.1 (LMY48Y)', 
         '5.1.1 (For T-Mobile ONLY) (LYZ28N)', 
         '5.1.1 (For Project Fi ONLY) (LVY48I)', 
         '6.0.0 (MRA58K)', 
         '6.0.0 (MRA58N)', 
         '6.0.0 (MRA58R)', 
         '6.0.0 (MRA58X)', 
         '6.0.1 (MMB29K)', 
         '6.0.1 (MMB29S)'], 
'"soju" for Nexus S (worldwide version, i9020t and i9023)': ['2.3.6 (GRK39F)', 
                   '4.0.4 (IMM76D)', 
                   '4.1.2 (JZO54K)'], 
'"sojua" for Nexus S (850MHz version, i9020a)': ['2.3.6 (GRK39F)', 
                '4.0.4 (IMM76D)', 
                '4.1.2 (JZO54K)'], 
'"sojuk" for Nexus S (Korea version, m200)': ['2.3.6 (GRK39F)', 
               '4.0.4 (IMM76D)', 
               '4.1.1 (JRO03E)'], 
'"sojus" for Nexus S 4G (d720)': ['2.3.7 (GWK74)', 
            '4.0.4 (IMM76D)', 
            '4.1.1 (JRO03R)'], 
'"takju" for Galaxy Nexus "maguro" (GSM/HSPA+) (with Google Wallet)': ['4.0.4 ' 
                     '(IMM76I)', 
                     '4.1.2 ' 
                     '(JZO54K)', 
                     '4.2.2 ' 
                     '(JDQ39)', 
                     '4.3 ' 
                     '(JWR66Y)'], 
'"tungsten" for Nexus Q': ['4.0.4 (IAN67K)'], 
'"volantis" for Nexus 9 (Wi-Fi)': ['5.0 (LRX21Q)', 
            '5.0 (LRX21R)', 
            '5.0.1 (LRX22C)', 
            '5.0.2 (LRX22L)', 
            '5.1.1 (LMY47X)', 
            '5.1.1 (LMY48I)', 
            '5.1.1 (LMY48M)', 
            '5.1.1 (LMY48T)', 
            '6.0.0 (MRA58K)', 
            '6.0.0 (MRA58N)', 
            '6.0.1 (MMB29K)', 
            '6.0.1 (MMB29S)'], 
'"volantisg" for Nexus 9 (LTE)': ['5.0.1 (LRX22C)', 
            '5.0.2 (LRX22L)', 
            '5.1.1 (LMY47X)', 
            '5.1.1 (LMY48I)', 
            '5.1.1 (LMY48M)', 
            '5.1.1 (LMY48T)', 
            '5.1.1 (LMY48X)', 
            '5.1.1 (LMY48Z)', 
            '5.1.1 (LMY49F)', 
            '6.0.0 (MRA58K)', 
            '6.0.0 (MRA58N)', 
            '6.0.1 (MMB29K)', 
            '6.0.1 (MMB29S)'], 
'"yakju" for Galaxy Nexus "maguro" (GSM/HSPA+)': ['4.0.4 (IMM76I)', 
                '4.1.2 (JZO54K)', 
                '4.2.2 (JDQ39)', 
                '4.3 (JWR66Y)']} 
+0

我最终采用了这个方案,不过针对我的代码稍微做了些调整。谢谢! – vaindil

1

下面的代码会生成一个列表,其中包含每个设备的最后一个条目。要做到这一点,你仍然需要遍历所有条目,但之后只保留最后一个条目,如下所示:

from bs4 import BeautifulSoup  
import requests 


# Fetch the listing page. The timeout keeps a stalled connection from hanging
# the script; raise_for_status() stops an HTTP error page from being parsed
# as if it were the real listing.
html = requests.get("https://developers.google.com/android/nexus/images", timeout=30)
html.raise_for_status()
soup = BeautifulSoup(html.text, "lxml")

# One entry per device: [heading text, <td> texts of the table's last row...].
models = []

# The first <h2> with an id is skipped, as in the original script.
for h2 in soup.find_all('h2', id=True)[1:]:
    table = h2.find_next('table')
    rows = table.find_all('tr', id=True) if table is not None else []
    if not rows:
        # No table, or no id-bearing rows: there is no "latest" row to keep.
        continue

    # Newest builds are appended at the bottom, so the last row is the latest.
    td = [t.text.strip() for t in rows[-1].find_all('td')]
    if not td:
        continue
    models.append([h2.text] + td)

# Only the first two fields are printed; the starred target absorbs however
# many extra columns (link, checksums, ...) the row happens to have.
for device, version, *_ in models:
    # print() is a function in Python 3; the statement form is a SyntaxError.
    print('{}, {}'.format(device, version))

这显示以下内容:

"ryu" for Pixel C, 6.0.1 (MXB48K) 
"angler" for Nexus 6P, 6.0.1 (MMB29P) 
"bullhead" for Nexus 5X, 6.0.1 (MMB29P) 
"shamu" for Nexus 6, 6.0.1 (MMB29S) 
"fugu" for Nexus Player, 6.0.1 (MMB29T) 
"volantisg" for Nexus 9 (LTE), 6.0.1 (MMB29S) 
"volantis" for Nexus 9 (Wi-Fi), 6.0.1 (MMB29S) 
"hammerhead" for Nexus 5 (GSM/LTE), 6.0.1 (MMB29S) 
"razor" for Nexus 7 [2013] (Wi-Fi), 6.0.1 (MMB29O) 
"razorg" for Nexus 7 [2013] (Mobile), 6.0.1 (MMB29O) 
"mantaray" for Nexus 10, 5.1.1 (LMY49F) 
"occam" for Nexus 4, 5.1.1 (LMY48T) 
"nakasi" for Nexus 7 (Wi-Fi), 5.1.1 (LMY47V) 
"nakasig" for Nexus 7 (Mobile), 5.1.1 (LMY47V) 
"tungsten" for Nexus Q, 4.0.4 (IAN67K) 
"takju" for Galaxy Nexus "maguro" (GSM/HSPA+) (with Google Wallet), 4.3 (JWR66Y) 
"yakju" for Galaxy Nexus "maguro" (GSM/HSPA+), 4.3 (JWR66Y) 
"mysid" for Galaxy Nexus "toro" (Verizon CDMA/LTE), 4.2.2 (JDQ39) 
"mysidspr" for Galaxy Nexus "toroplus" (Sprint CDMA/LTE), 4.2.1 (GA02) 
"soju" for Nexus S (worldwide version, i9020t and i9023), 4.1.2 (JZO54K) 
"sojua" for Nexus S (850MHz version, i9020a), 4.1.2 (JZO54K) 
"sojuk" for Nexus S (Korea version, m200), 4.1.1 (JRO03E) 
"sojus" for Nexus S 4G (d720), 4.1.1 (JRO03R)