2015-08-15 32 views
1

以下是页面的一部分 HTML 代码,我想从中提取“名称”和“event_place”。但是,我从来没有见过以这么复杂的方式塞进页面的数据。在 <script> 标签中有 'var person',其中名字出现在 "personBestName" 下,即 'John Stuart'。(标题:用 BeautifulSoup 从 <script> 和 var 中提取数据)

同样,“event_place”也位于 'var person' 之下……提取出的事件发生地点应该是 "B, Hamilton (city/cité), Ontario, Canada"。

<script> 

    var person = {"id":"p_14062397399","links":{"record":{"href":"https://familysearch.org/platform/records/records/9MFX-7VLY"},"persona":{"href":"https://familysearch.org/platform/records/personas/KH21-F11"}},"extracted":true,"identifiers":{"http://gedcomx.org/Persistent":["https://familysearch.org/ark:/61903/1:1:KH21-F11"],"$":["https://familysearch.org/platform/externalId/easy/1001080442645"]},"principal":true,"gender":{"type":"http://gedcomx.org/Male","fields":[{"type":"http://gedcomx.org/Gender","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_SEX_CODE","text":"Male","resource":"http://gedcomx.org/Male"}]}]},"names":[{"type":"http://gedcomx.org/BirthName","nameForms":[{"fullText":"John Stuart","parts":[{"type":"http://gedcomx.org/Given","value":"John","fields":[{"type":"http://gedcomx.org/Given","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NAME_GN","text":"John"}]}]},{"type":"http://gedcomx.org/Surname","value":"Stuart","fields":[{"type":"http://gedcomx.org/Surname","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NAME_SURN","text":"Stuart"}]}]}],"fields":[{"type":"http://gedcomx.org/Name","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NAME","text":"John Stuart"}]}]}]}],"facts":[{"type":"http://gedcomx.org/MaritalStatus","value":"Single","fields":[{"type":"http://gedcomx.org/MaritalStatus","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_MARITAL_STATUS","text":"Single"}]}]},{"type":"http://gedcomx.org/Religion","value":"Presbyterian","fields":[{"type":"http://gedcomx.org/Religion","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_RELIGION","text":"Presbyterian"}]}]},{"type":"http://gedcomx.org/Nationality","value":"Canadian","fields":[{"type":"http://gedcomx.org/Nationality","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_NATIONALITY","text":"Canadian"}]}]},{"type":"http://gedcomx.org/Census","date":{"original":"31 Mar 
1901","fields":[{"type":"http://gedcomx.org/Date","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"EVENT_DATE","text":"31 Mar 1901"}]},{"type":"http://gedcomx.org/Year","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"EVENT_YEAR","text":"1901"}]}]},"place":{"original":"B, Hamilton (city/cité), Ontario, Canada","fields":[{"type":"http://gedcomx.org/Place","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"EVENT_PLACE","text":"B, Hamilton (city/cité), Ontario, Canada"}]}]},"primary":true},{"type":"http://gedcomx.org/Birth","date":{"original":"1831","fields":[{"type":"http://gedcomx.org/Year","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_BIRTH_YEAR_ESTIMATED","text":"1831"}]}]},"place":{"original":"Scotland","fields":[{"type":"http://gedcomx.org/Place","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_BIRTH_PLACE","text":"Scotland"}]}]}},{"type":"http://gedcomx.org/Immigration","date":{"original":"1848","fields":[{"type":"http://gedcomx.org/Year","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_IMMIGRATION_YEAR","text":"1848"}]}]}}],"fields":[{"type":"http://gedcomx.org/Age","values":[{"type":"http://gedcomx.org/Original","labelId":"PR_AGE_ORIG","text":"70"}]},{"type":"http://familysearch.org/types/fields/UniqueIdentifier","values":[{"type":"http://gedcomx.org/Original","labelId":"UNIQUE_IDENTIFIER","text":"1001080442645"}]},{"type":"http://familysearch.org/types/fields/HouseholdId","values":[{"type":"http://gedcomx.org/Original","labelId":"HOUSEHOLD_ID","text":"66"}]},{"type":"http://gedcomx.org/RelationshipToHead","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"PR_RELATIONSHIP_TO_HEAD","text":"Head"}]},{"type":"http://familysearch.org/types/fields/RelationshipToHeadCode","values":[{"type":"http://gedcomx.org/Original","labelId":"RELATIONSHIP_CODE","text":"SELF"}]},{"type":"http://familysearch.org/types/fields/CollectionId","values":[{"type":"http://gedcomx.org/Origina
l","labelId":"COLLECTION_ID","text":"1584557"}]},{"type":"http://familysearch.org/types/fields/EventDistrict","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_DISTRICT","text":"Hamilton (city/cité)"}]},{"type":"http://familysearch.org/types/fields/EventProvince","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_PROVINCE","text":"Ontario"}]},{"type":"http://familysearch.org/types/fields/EventSubDistrict","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_SUB_DISTRICT","text":"B"}]},{"type":"http://familysearch.org/types/fields/EventType","values":[{"type":"http://gedcomx.org/Original","labelId":"EVENT_TYPE","text":"Census"}]},{"type":"http://familysearch.org/types/fields/Id","values":[{"type":"http://gedcomx.org/Original","labelId":"ID","text":"z002-z000067618"}]},{"type":"http://familysearch.org/types/fields/Page","values":[{"type":"http://gedcomx.org/Original","labelId":"PAGE","text":"8"}]},{"type":"http://familysearch.org/types/fields/Pid","values":[{"type":"http://gedcomx.org/Original","labelId":"PID","text":"11335440"}]},{"type":"http://familysearch.org/types/fields/PpqId","values":[{"type":"http://gedcomx.org/Original","labelId":"PPQ_ID","text":"08-0278"}]},{"type":"http://familysearch.org/types/fields/PrAgeInYears","values":[{"type":"http://gedcomx.org/Original","labelId":"PR_AGE_IN_YEARS","text":"70"}]},{"type":"http://familysearch.org/types/fields/PrRacialOrTribalOrigin","values":[{"type":"http://gedcomx.org/Original","labelId":"PR_RACIAL_OR_TRIBAL_ORIGIN","text":"Scotch"}]},{"type":"http://familysearch.org/types/fields/RollNumber","values":[{"type":"http://gedcomx.org/Original","labelId":"ROLL_NUMBER","text":"CC1901_47"}]},{"type":"http://familysearch.org/types/fields/SortKey","values":[{"type":"http://gedcomx.org/Interpreted","labelId":"SORT_KEY","text":"z002-z000067618_0000066_11335440_1001080442645"}]}],"url":"https://familysearch.org/ark:/61903/1:1:KH21-F11","personBestName":"John 
Stuart","localizedGender":"Male","title":"John Stuart, \"Canada Census, 1901\"","personRecordTitle":"John Stuart","metadata":{"bibliographicCitation":"\"Canada Census, 1901,\" , <i>FamilySearch</i> (https://familysearch.org/ark:/61903/1:1:KH21-F11 : accessed 14 August 2015), John Stuart, B, Hamilton (city/cité), Ontario, Canada; citing p. 8, Library and Archives of Canada, Ottawa."},"imageMeta":{"thirdPartyHostName":"","isExternalImage":false,"thirdPartyURL":"","imageURL":"","wikiCollectionURL":"/learn/wiki/en/api.php?action=query&list=search&srwhat=text&format=json&srsearch=CID1584557"}}; 

我已经能够从 HTML 的另一部分(此处未示出)中,通过标签和指定的 class 提取出名称。

# coding=utf-8 
import urllib2 
import re 
import csv 
from bs4 import BeautifulSoup 
import time 
from unicodedata import normalize 
# Download the record page and parse it; the response is closed right after
# the parser has consumed it.
Url = "https://familysearch.org/pal:/MM9.1.1/KHR6-D6D"
Page = urllib2.urlopen(Url)
Soup = BeautifulSoup(Page)
Page.close()

# The name is taken from the start of the print-only <h3> title.
heading = Soup.find("h3", attrs={"class": "print-only print-title"})
title = heading.string.encode('utf-8')

# Keep only what precedes ', "Can', and of that only what precedes
# ' in household'.
name = title.split(', "Can')[0].split(' in household')[0]
print(name)

编辑:

# Get other fields
import json  # used below; the import lines of this script do not include it

# Locate the <script> that defines `var person` by its content instead of by
# a fixed position in the page, so that a change in the number or order of
# scripts does not select the wrong one.
marker = 'var person = '
tail = None
for script in Soup.find_all('script'):
    text = script.string or u''  # .string is None for scripts without inline text
    if marker in text:
        # Everything after the marker starts with the JSON object literal.
        tail = text.split(marker, 1)[1].lstrip()
        break
if tail is None:
    raise ValueError("no <script> block containing %r was found" % marker)

# raw_decode parses a single JSON document from the start of the text and
# reports where it ended, so the trailing ';' and any JavaScript that follows
# are ignored without further string splitting.
s, end = json.JSONDecoder().raw_decode(tail)
JsonText = tail[:end]

print(s["personBestName"])

# UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 5: ordinal not in range(128) 
+0

这就是JSON。您可以使用python中的'json'模块轻松读取。 – Rishav

回答

2

这一长串是 JSON,大致对应于 Python 的字典。其中包含键值对,例如键 "id" 对应的值是 "p_14062397399"。

我把这段 JSON 格式化了一下(原文此处为链接),这样您可以轻松看到键值对和嵌套结构。要提取名称和地点,可以这样做:

from bs4 import BeautifulSoup as bs 
from urllib import urlopen 
import json 

# Download and parse the record page.
Soup = bs(urlopen('https://familysearch.org/pal:/MM9.1.1/KHR6-D6D').read())

# Locate the <script> that defines `var person` by its content instead of by
# a fixed position in the page, so that a change in the number or order of
# scripts does not select the wrong one.
marker = 'var person = '
tail = None
for script in Soup.find_all('script'):
    text = script.string or u''  # .string is None for scripts without inline text
    if marker in text:
        # Everything after the marker starts with the JSON object literal.
        tail = text.split(marker, 1)[1].lstrip()
        break
if tail is None:
    raise ValueError("no <script> block containing %r was found" % marker)

# raw_decode parses a single JSON document from the start of the text and
# reports where it ended, so the trailing ';' and any JavaScript that follows
# are ignored without further string splitting.
s, end = json.JSONDecoder().raw_decode(tail)
JsonText = tail[:end]

print(s["personBestName"])

# Print the place text of the census fact; it is nested under
# place -> fields -> values.
# NOTE(review): under Python 2, printing non-ASCII text (e.g. "cité") can
# raise UnicodeEncodeError when stdout's encoding is ASCII -- encode
# explicitly if that happens.
for fact in s["facts"]:
    if fact["type"] == "http://gedcomx.org/Census":
        print(fact["place"]["fields"][0]["values"][0]["text"])
+0

谢谢,但 's' 应该是什么?我试过 s = Soup.find_all('script') 以及 s = Soup,都报错了。 – KubiK888

+0

谢谢,虽然我怎样才能从整个脚本中自动提取Json部分? aka从开始时删除'