2012-10-18 158 views
0

我有一个别人替我写好的 Python 模块,用来下载并解析 Google 专利列表中的数据。代码一直运行良好,但只要处理 2005 年之前的数据就会出错。除了如何运行模块之外,我对 Python 一无所知。我该如何解决这个问题?(标题:使用 Python 解析 XML 文件)

我收到的回溯信息是:

Traceback (most recent call last): 
    File "C:\Users\John\Desktop\FINAL BART ALL INFO-Magic Bullet.py", line 46, in <module> 
    assert xml_file is not None 
AssertionError 

这是我使用的代码:

#Ignore all this information 
import urllib2, os, zipfile 
from lxml import etree 
#------------------------------------------------------------------------------- 
#Ignore all this information 
def xmlSplitter(data,separator=lambda x: x.startswith('<?xml')):
    """Split a stream of concatenated XML documents into single documents.

    data is any iterable of text lines (for example an open file).  Every
    line for which separator(line) is true starts a new document; by
    default that is a line beginning with an XML declaration ('<?xml').

    Yields each document as one string made of its lines joined together.
    Lines that come before the first separator are yielded as a document of
    their own.  Nothing is yielded for empty input, so callers never
    receive an empty string to parse.
    """
    buff = []
    for line in data:
        # A separator line closes the document collected so far (if any)
        # and becomes the first line of the next one.
        if separator(line) and buff:
            yield ''.join(buff)
            buff = []
        buff.append(line)
    # Flush the last document; skipped when data produced no lines at all.
    if buff:
        yield ''.join(buff)

def first(seq,default=None):
    """Return the first item of the iterable seq, or default if it is empty.

    seq may be any iterable (list, generator, ...); at most one item is
    consumed from it.  default is returned unchanged when seq yields
    nothing and is None unless the caller supplies another value.
    """
    return next(iter(seq), default)
#------------------------------------------------------------------------------- 
#This is where you change the internet source file- Use the file extensions from the sheet provided.
datasrc = "http://storage.googleapis.com/patents/grant_full_text/2003/pg030107.zip"
#http://commondatastorage.googleapis.com/patents/grant_full_text/2012/ipg120117.zip
# The local file name is the last path component of the URL.
filename = datasrc.split('/')[-1]
#-------------------------------------------------------------------------------
# Download the archive once; later runs reuse the local copy.
if not os.path.exists(filename):
    # Fetch the complete payload BEFORE creating the local file.  Otherwise
    # a failed download leaves an empty file behind, and the
    # os.path.exists() check above would treat it as a finished download
    # on the next run.
    r = urllib2.urlopen(datasrc)
    try:
        payload = r.read()
    finally:
        r.close()
    with open(filename,'wb') as file_write:
        file_write.write(payload)

zf = zipfile.ZipFile(filename)
# The rest of the script parses the first archive member ending in '.xml'.
xml_file = first([ x for x in zf.namelist() if x.endswith('.xml')])
if xml_file is None:
    # Raised explicitly (instead of a bare assert) so the check still runs
    # under "python -O" and the message shows what the archive contains.
    raise AssertionError(
        "No .xml file found in %s (downloaded from %s); the archive contains: %r. "
        "This script can only parse archives that include an .xml file."
        % (filename, datasrc, zf.namelist()))
#------------------------------------------------------------------------------- 
#output set your folder location here, keep double \\ between
outFolder = "C:\\PatentFiles\\"
# Base name for all output files: the archive name without its extension.
outFilename = os.path.splitext(filename)[0]
# open() does not create missing directories, so make sure the output
# folder exists before the four files below are opened for writing.
if not os.path.isdir(outFolder):
    os.makedirs(outFolder)
#-------------------------------------------------------------------------------
#These outputs are the names of the files-Ignore all this information
output = outFolder + outFilename + "_general.txt"
output2 = outFolder + outFilename + "_USCL.txt"
output3 = outFolder + outFilename + "_citation.txt"
output4 = outFolder + outFilename + "_inventor.txt"
#Open files (they stay open while the documents are processed and are
#closed at the end of the script)
outFile = open(output, "w")
outFile2 = open(output2, "w")
outFile3 = open(output3, "w")
outFile4 = open(output4, "w")
#write the headers: one pipe-delimited header row per output file
outFile.write("Patent No.|GrantDate|Application Date|Number of Claims|Examiners|US Primary Main Classification|Assignee|Assignee Address City_State_Country|First Inventor|First Inventor Address City_State_Country| \n")
outFile2.write("Patent No.|Primary|U.S Classification| \n")
outFile3.write ("Patent No.|Citation|Citation Date|Who Cited This| \n")
outFile4.write ("Patent No.|Inventor Last Name|First Name|City|State|Country|Nationality Country|Residence Country|\n")
#------------------------------------------------------------------------------- 
# Main loop: split the XML member of the archive into individual documents,
# parse each one with lxml, and append one pipe-delimited record per document
# to each of the four output files (general info, US classification,
# citations, inventors).
#
# Most of the work is done by joining XPath text results with "~" and then
# applying long chains of str.replace() calls.  The ORDER of those calls
# matters: later replacements rely on the markers left by earlier ones.
#
# NOTE: this is Python 2 code (print statement further below).
# NOTE(review): the output files are closed only after the loop finishes
# normally; an exception inside the loop leaves them to be closed at
# interpreter exit.
#
#Here is the count- adjust this each time you run the program for the first time.
#Run at 10 for the 1st run then 5500 afterward.
count = 0
for item in xmlSplitter(zf.open(xml_file)):
    count += 1
    #5500
    # Stop after the first 10 documents (raise the limit for a full run).
    if count > 10: break
    doc = etree.XML(item)
    #-------------------------------------------------------------------------------
    #This is where the python starts parsing the information.
    #This is the Start of the General Information file.
    # docID: country and doc-number text joined by "~"; the replacements
    # below drop one leading zero after the D/PP/RE prefixes and after "~",
    # shorten "H000" to "H", and finally remove the "US~" country prefix.
    # (The "PP0" replacement appears twice in a row.)
    docID = "~".join(doc.xpath('//publication-reference/document-id/country/text()|//publication-reference/document-id/doc-number/text()'))
    docID = docID.replace("D0","D")
    docID = docID.replace("H000","H")
    docID = docID.replace("PP0","PP")
    docID = docID.replace("PP0","PP")
    docID = docID.replace("RE0","RE")
    docID = docID.replace("~0","~")
    docID = docID.replace("US~","")

    # first() returns None when the XPath matches nothing, so each of these
    # may be None and is later written out via str() as the text "None".
    grantdate = first(doc.xpath('//publication-reference/document-id/date/text()'))
    applicationdate = first(doc.xpath('//application-reference/document-id/date/text()'))
    claimsNum = first(doc.xpath('//number-of-claims/text()'))

    # Assignee name parts are joined with "-" and then every "-" becomes
    # ", " (this also rewrites hyphens that were part of a name).
    assignee1 = "-".join(doc.xpath('//assignees/assignee/addressbook/orgname/text()|//assignees/assignee/addressbook/last-name/text()|//assignees/assignee/addressbook/first-name/text()'))
    assignee1 = assignee1.replace('-',', ')
    assignee2 = "_".join(doc.xpath('//assignee/addressbook/address/*/text()'))
    assignees = str(assignee1.encode("UTF-8")) + "|" + str(assignee2.encode("UTF-8"))

    # First applicant only: name and address pieces, each possibly None.
    inventors1 = first(doc.xpath('//applicants/applicant/addressbook/last-name/text()'))
    inventor2 = first(doc.xpath('//applicants/applicant/addressbook/first-name/text()'))
    inventor3 = first(doc.xpath('//applicants/applicant/addressbook/address/city/text()'))
    inventor4 = first(doc.xpath('//applicants/applicant/addressbook/address/state/text()'))
    inventor5 = first(doc.xpath('//applicants/applicant/addressbook/address/country/text()'))
    # Name and city are UTF-8 encoded only when present; state and country
    # go through str() directly.
    inventor = str(inventor2.encode("UTF-8") if inventor2 else inventor2) + " " + str(inventors1.encode("UTF-8") if inventors1 else inventors1)
    inventors2 = str(inventor3.encode("UTF-8") if inventor3 else inventor3) + "_" + str(inventor4) + "_" + str(inventor5)
    inventors = str(inventor) + "|" + str(inventors2)

    # Primary examiner name parts joined as "part, part".
    examiners = "~".join(doc.xpath('//examiners/primary-examiner/first-name/text()|//examiners/primary-examiner/last-name/text()'))
    examiners = examiners.replace("~",", ")

    uscl1 = first(doc.xpath('//classification-national/main-classification/text()'))

    #END FIRST TEXT FILE #-------------------------------------------------------------------------------
    #This begins the USCL file
    # The publication country text is reused as a flag: "US" becomes "0"
    # (not primary) or "1" (primary).
    # NOTE(review): if the XPath matches nothing, first() returns None and
    # the .replace() call raises AttributeError.
    notprimary = first(doc.xpath('//publication-reference/document-id/country/text()'))
    notprimary = notprimary.replace("US","0")

    primary1 = first(doc.xpath('//publication-reference/document-id/country/text()'))
    primary1 = primary1.replace("US","1")

    uscl2 = "~".join(doc.xpath('//us-bibliographic-data-grant/classification-national/*/text()|//sequence-cwu/publication-reference/document-id/country/text()'))
    #-------------------------NOTE--------------------------------------------------
    #--------------------------NOTE-------------------------------------------------
    #-----------------------NOTE----------------------------------------------------
    #NOTE- RUN through count 10 then remove pound signs from two below
    # "US~" is replaced by the primary flag; every remaining "~" starts a
    # new output row "<docID>|<notprimary flag>|".
    uscl2 = uscl2.replace("US~", str(primary1) + "|")
    uscl2 = uscl2.replace("~", "|" + "\n" + str(docID) + "|" + str(notprimary) + "|")
    uscl2 = uscl2.replace("US", "|")

    #END SECOND TEXT FILE #-------------------------------------------------------------------------------
    #Begin the Citation file
    # All citation fields (country, doc-number, kind, date, category), plus
    # the publication country, are joined into one "~"-separated string.
    citation = '~'.join(doc.xpath('//publication-reference/document-id/country/text()|//references-cited/citation/patcit/document-id/country/text()|//references-cited/citation/patcit/document-id/doc-number/text()|//references-cited/citation/patcit/document-id/kind/text()|//references-cited/citation/patcit/document-id/date/text()|//references-cited/citation/category/text()'))

    #Here is the start of the patent connectors- in the patents they exist at the end. They are replaced in this code to make pipes | for the final output
    # The "~<kind code>~" separators are first swapped for a temporary
    # marker, which a replacement further below turns into "|".
    # NOTE(review): the marker literal "[email protected]" looks like an
    # e-mail-obfuscation artifact from wherever this script was copied;
    # confirm what the original marker text was.
    citation = citation.replace("~A~", "[email protected]")
    citation = citation.replace("~S~", "[email protected]")
    citation = citation.replace("~S1~", "[email protected]")
    citation = citation.replace("~B1~", "[email protected]")
    citation = citation.replace("~B2~", "[email protected]")
    citation = citation.replace("~A1~", "[email protected]")
    citation = citation.replace("~H~", "[email protected]")
    citation = citation.replace("~E~", "[email protected]")


    #citation = citation.replace("~QQ~", "[email protected]")

    #make unique citation changes here-for example when "US" or "DE" in imbeded in citation see below
    # "$" and "!" temporarily mask the letters "S" and "E" so the country
    # replacements below do not hit them; both are restored in the
    # post-processing step at the end of the citation section.
    citation = citation.replace("05225US~", "05225U$|")
    citation = citation.replace("063106 DE", "063106D!")
    citation = citation.replace("US~US~", "US~")
    citation = citation.replace("PCT/US", "PCT/U$")
    citation = citation.replace("PCTUS", "PCTU$")
    citation = citation.replace("WO US", "WO U$")
    citation = citation.replace("WO~US", "WO~ U$")

    #fixes for cites without pipes-see below -DONT TOUCH THESE
    citation = citation.replace("US~cited by examiner", "||cited by examiner")
    citation = citation.replace("US~cited by other", "||cited by other")


    #Here are the changes to return each citation into a unique row
    #If a country is only listed in the columns in Excel they need a fix like this, If KR is alone then use the code:::: citation = citation.replace("KR~", "Foreign -KR-")
    # Each "~<country>~" separator starts a new output row prefixed with the
    # docID; non-US countries are additionally tagged "Foreign -XX-".
    citation = citation.replace("[email protected]", "|")
    citation = citation.replace("~US~", "|" + "\n" + str(docID) +"|")
    citation = citation.replace("US~", "")
    citation = citation.replace("~JP~", "|" + "\n" + str(docID) +"|"+ "Foreign -JP-")
    citation = citation.replace("JP~", "Foreign -JP-")
    citation = citation.replace("~GB~", "|" + "\n" + str(docID) +"|"+ "Foreign -GB-")
    citation = citation.replace("GB~", "Foreign -GB-")
    citation = citation.replace("~WO~", "|" + "\n" + str(docID) +"|"+ "Foreign -WO-")
    citation = citation.replace("WO~", "Foreign -WO-")
    citation = citation.replace("~CA~", "|" + "\n" + str(docID) +"|"+ "Foreign -CA-")
    citation = citation.replace("~DE~EP~", "~DE~ EP-")
    citation = citation.replace("~DE~", "|" + "\n" + str(docID) +"|"+ "Foreign -DE-")
    citation = citation.replace("DE~", "Foreign -DE-")
    citation = citation.replace("~KR~", "|" + "\n" + str(docID) +"|"+ "Foreign -KR-")
    citation = citation.replace("KR~", "Foreign -KR-")
    citation = citation.replace("~EM~", "|" + "\n" + str(docID) +"|"+ "Foreign -EM-")
    citation = citation.replace("~CH~", "|" + "\n" + str(docID) +"|"+ "Foreign -CH-")
    # (Same "~DE~" replacement as a few lines above.)
    citation = citation.replace("~DE~", "|" + "\n" + str(docID) +"|"+ "Foreign -DE-")
    citation = citation.replace("~SE~", "|" + "\n" + str(docID) +"|"+ "Foreign -SE-")
    citation = citation.replace("~FR~", "|" + "\n" + str(docID) +"|"+ "Foreign -FR-")
    # NOTE(review): this runs after every "~FR~" has already been replaced,
    # so it probably never matches (the DE/EP fix above is placed BEFORE
    # its "~DE~" replacement) - confirm the intended order.
    citation = citation.replace("~FR~EP~", "~FR~ EP-")
    citation = citation.replace("FR~", "Foreign -FR-")
    citation = citation.replace("~CN~", "|" + "\n" + str(docID) +"|"+ "Foreign -CN-")
    citation = citation.replace("~TW~", "|" + "\n" + str(docID) +"|"+ "Foreign -TW-")
    citation = citation.replace("~TW", "|" + "\n" + str(docID) +"|"+ "Foreign -TW-")
    citation = citation.replace("TW~", "Foreign -TW-")
    citation = citation.replace("~NL~", "|" + "\n" + str(docID) +"|"+ "Foreign -NL-")
    citation = citation.replace("~BR~", "|" + "\n" + str(docID) +"|"+ "Foreign -BR-")
    citation = citation.replace("~AU~", "|" + "\n" + str(docID) +"|"+ "Foreign -AU-")
    citation = citation.replace("~ES~", "|" + "\n" + str(docID) +"|"+ "Foreign -ES-")
    citation = citation.replace("~IT~", "|" + "\n" + str(docID) +"|"+ "Foreign -IT-")
    citation = citation.replace("~SU~", "|" + "\n" + str(docID) +"|"+ "Foreign -SU-")
    citation = citation.replace("~AT~", "|" + "\n" + str(docID) +"|"+ "Foreign -AT-")
    citation = citation.replace("~BE~", "|" + "\n" + str(docID) +"|"+ "Foreign -BE-")
    citation = citation.replace("~DK~", "|" + "\n" + str(docID) +"|"+ "Foreign -DK-")
    citation = citation.replace("~RU~", "|" + "\n" + str(docID) +"|"+ "Foreign -RU-")
    citation = citation.replace("RU~", "Foreign -RU-")


    #citation = citation.replace("~QQ~", "|" + "\n" + str(docID) +"|"+ "Foreign -QQ-")

    #These are just end of citation fixes-DONT TOUCH THESE
    # Collapse runs of consecutive "cited by ..." category markers into a
    # single marker (the first one in the run wins).  The same patterns are
    # applied several times because each str.replace() call makes only one
    # pass over the string.
    citation = citation.replace("cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other~cited by other", "cited by other")
    citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner")
    citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other")
    citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other")
    citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner")
    citation = citation.replace("cited by other~cited by other", "cited by other")
    citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner")
    citation = citation.replace("cited by other~cited by examiner", "cited by other")
    citation = citation.replace("cited by examiner~cited by other", "cited by examiner")
    citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner")
    citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other")
    citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other")
    citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner")
    citation = citation.replace("cited by other~cited by other", "cited by other")
    citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner")
    citation = citation.replace("cited by other~cited by examiner", "cited by other")
    citation = citation.replace("cited by examiner~cited by other", "cited by examiner")
    citation = citation.replace("cited by examiner~cited by other~cited by other", "cited by examiner")
    citation = citation.replace("cited by other~cited by examiner~cited by examiner", "cited by other")
    citation = citation.replace("cited by other~cited by other~cited by other~cited by other", "cited by other")
    citation = citation.replace("cited by examiner~cited by examiner~cited by examiner~cited by examiner", "cited by examiner")
    citation = citation.replace("cited by other~cited by other", "cited by other")
    citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner")
    citation = citation.replace("cited by other~cited by examiner", "cited by other")
    citation = citation.replace("cited by examiner~cited by other", "cited by examiner")
    citation = citation.replace("cited by other~cited by other", "cited by other")
    citation = citation.replace("cited by examiner~cited by examiner", "cited by examiner")
    citation = citation.replace("cited by other~cited by examiner", "cited by other")
    citation = citation.replace("cited by examiner~cited by other", "cited by examiner")

    # Every separator still left becomes a column delimiter.
    citation = citation.replace("~", "|")

    # Any "US" still present at this point is turned into two delimiters.
    citation = citation.replace("US", "||")

    #make unique post-processing citation changes here-If needed for the end of the scripts
    citation = citation.replace("CA|", "Foreign -CA-")
    citation = citation.replace("EP|", "Foreign -EP-")
    citation = citation.replace("CN|", "Foreign -CN-")
    # Restore the letters masked earlier ("$" -> "S", "D!" -> "DE").
    citation = citation.replace("$", "S")
    citation = citation.replace("D!", "DE")

    #citation = citation.replace(" ", " ")

    #END CITATION FILE-------------------------------------------------------------------------------

    #START the inventors file
    # All applicants: name, address, nationality and residence text joined
    # into one "~"-separated string (plus country/number text found under
    # sequence-cwu, when present).
    inventor1 = doc.xpath('//applicants/applicant/addressbook/last-name/text()|//applicants/applicant/addressbook/first-name/text()|//applicants/applicant/addressbook/address/city/text()|//applicants/applicant/addressbook/address/state/text()|//applicants/applicant/addressbook/address/country/text()|//applicants/applicant/nationality/*/text()|//applicants/applicant/residence/*/text()|//sequence-cwu/publication-reference/document-id/country/text()|//sequence-cwu/number/text()')
    inventor1 = '~'.join(inventor1).replace('\n-','')

    #For files after 2009 use this to replace State errors in the Excel- If the output is short then use this to add in a None value for State
    # Inserts a literal "None" field in front of "<country>~omitted" so rows
    # for the listed countries get the same number of columns as rows that
    # carry a state.
    inventor1 = inventor1.replace('~KR~omitted','~None~KR~omitted')
    inventor1 = inventor1.replace('~GB~omitted','~None~GB~omitted')
    inventor1 = inventor1.replace('~IT~omitted','~None~IT~omitted')
    inventor1 = inventor1.replace('~JP~omitted','~None~JP~omitted')
    inventor1 = inventor1.replace('~FR~omitted','~None~FR~omitted')
    inventor1 = inventor1.replace('~BR~omitted','~None~BR~omitted')
    inventor1 = inventor1.replace('~NO~omitted','~None~NO~omitted')
    inventor1 = inventor1.replace('~HK~omitted','~None~HK~omitted')
    inventor1 = inventor1.replace('~CA~omitted','~None~CA~omitted')
    inventor1 = inventor1.replace('~TW~omitted','~None~TW~omitted')
    inventor1 = inventor1.replace('~SE~omitted','~None~SE~omitted')
    inventor1 = inventor1.replace('~CH~omitted','~None~CH~omitted')
    inventor1 = inventor1.replace('~DE~omitted','~None~DE~omitted')
    inventor1 = inventor1.replace('~SG~omitted','~None~SG~omitted')
    inventor1 = inventor1.replace('~IN~omitted','~None~IN~omitted')
    inventor1 = inventor1.replace('~IL~omitted','~None~IL~omitted')
    inventor1 = inventor1.replace('~CN~omitted','~None~CN~omitted')
    inventor1 = inventor1.replace('~FI~omitted','~None~FI~omitted')
    inventor1 = inventor1.replace('~ZA~omitted','~None~ZA~omitted')
    inventor1 = inventor1.replace('~NL~omitted','~None~NL~omitted')
    inventor1 = inventor1.replace('~AT~omitted','~None~AT~omitted')
    inventor1 = inventor1.replace('~AU~omitted','~None~AU~omitted')
    inventor1 = inventor1.replace('~BE~omitted','~None~BE~omitted')
    inventor1 = inventor1.replace('~CZ~omitted','~None~CZ~omitted')
    inventor1 = inventor1.replace('~RU~omitted','~None~RU~omitted')
    inventor1 = inventor1.replace('~IE~omitted','~None~IE~omitted')
    inventor1 = inventor1.replace('~AR~omitted','~None~AR~omitted')
    inventor1 = inventor1.replace('~MY~omitted','~None~MY~omitted')
    inventor1 = inventor1.replace('~SK~omitted','~None~SK~omitted')
    inventor1 = inventor1.replace('~ES~omitted','~None~ES~omitted')
    inventor1 = inventor1.replace('~NZ~omitted','~None~NZ~omitted')
    inventor1 = inventor1.replace('~HU~omitted','~None~HU~omitted')
    inventor1 = inventor1.replace('~UA~omitted','~None~UA~omitted')
    inventor1 = inventor1.replace('~DK~omitted','~None~DK~omitted')
    inventor1 = inventor1.replace('~TH~omitted','~None~TH~omitted')
    inventor1 = inventor1.replace('~MX~omitted','~None~MX~omitted')


    #inventor1 = inventor1.replace('~QQ~omitted','~None~QQ~omitted')

    #For the 2005-2008 files use these lines
    # Same "None" padding for rows where the country code appears three
    # times in a row.

    inventor1 = inventor1.replace('~NO~NO~NO','~None~NO~NO~NO')
    inventor1 = inventor1.replace('~NZ~NZ~NZ','~None~NZ~NZ~NZ')
    inventor1 = inventor1.replace('~RU~RU~RU','~None~RU~RU~RU')
    inventor1 = inventor1.replace('~RO~RO~RO','~None~RO~RO~RO')
    inventor1 = inventor1.replace('~SE~SE~SE','~None~SE~SE~SE')
    inventor1 = inventor1.replace('~SG~SG~SG','~None~SG~SG~SG')
    inventor1 = inventor1.replace('~SI~SI~SI','~None~SI~SI~SI')
    inventor1 = inventor1.replace('~TH~TH~TH','~None~TH~TH~TH')
    inventor1 = inventor1.replace('~TR~TR~TR','~None~TR~TR~TR')
    inventor1 = inventor1.replace('~TW~TW~TW','~None~TW~TW~TW')
    inventor1 = inventor1.replace('~VE~VE~VE','~None~VE~VE~VE')
    inventor1 = inventor1.replace('~ZA~ZA~ZA','~None~ZA~ZA~ZA')
    inventor1 = inventor1.replace('~AN~AN~AN','~None~AN~AN~AN')
    inventor1 = inventor1.replace('~AR~AR~AR','~None~AR~AR~AR')
    inventor1 = inventor1.replace('~BA~BA~BA','~None~BA~BA~BA')
    inventor1 = inventor1.replace('~PH~PH~PH','~None~PH~PH~PH')
    inventor1 = inventor1.replace('~HR~HR~HR','~None~HR~HR~HR')
    inventor1 = inventor1.replace('~LT~LT~LT','~None~LT~LT~LT')
    inventor1 = inventor1.replace('~EE~EE~EE','~None~EE~EE~EE')
    inventor1 = inventor1.replace('~BJ~BJ~BJ','~None~BJ~BJ~BJ')
    inventor1 = inventor1.replace('~CR~CR~CR','~None~CR~CR~CR')
    inventor1 = inventor1.replace('~PL~PL~PL','~None~PL~PL~PL')
    inventor1 = inventor1.replace('~CO~CO~CO','~None~CO~CO~CO')
    inventor1 = inventor1.replace('~UA~UA~UA','~None~UA~UA~UA')
    inventor1 = inventor1.replace('~KW~KW~KW','~None~KW~KW~KW')
    inventor1 = inventor1.replace('~CL~CL~CL','~None~CL~CL~CL')
    inventor1 = inventor1.replace('~CY~CY~CY','~None~CY~CY~CY')
    inventor1 = inventor1.replace('~LI~LI~LI','~None~LI~LI~LI')
    inventor1 = inventor1.replace('~SA~SA~SA','~None~SA~SA~SA')

    #inventor1 = inventor1.replace('~QQ~QQ~QQ','~None~QQ~QQ~QQ')

    #For lines that don't return use these lines in the code for 2009-
    # "omitted~<country>~" ends one inventor's fields: close the row with
    # "|" and a newline, then start the next row with the docID.
    inventor1 = inventor1.replace('omitted~US~','omitted~US' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~FR~','omitted~FR' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~DK~','omitted~DK' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~KR~','omitted~KR' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~JP~','omitted~JP' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~GB~','omitted~GB' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~IT~','omitted~IT' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~CH~','omitted~CH' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~SG~','omitted~SG' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~DE~','omitted~DE' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~IN~','omitted~IN' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~TW~','omitted~TW' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('omitted~CN~','omitted~CN' +"|"+ '\n' + str(docID) +"|")


    #inventor1 = inventor1.replace('omitted~QQ~','omitted~QQ' +"|"+ '\n' + str(docID) +"|")

    #for lines 2005-2008 use this line for returning countries
    # Same row break for the triple-country-code form.
    inventor1 = inventor1.replace('AT~AT~AT~','AT~AT~AT' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('AN~AN~AN~','AN~AN~AN' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('AR~AR~AR~','AR~AR~AR' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('AU~AU~AU~','AU~AU~AU' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('AZ~AZ~AZ~','AZ~AZ~AZ' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('BA~BA~BA~','BA~BA~BA' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('BE~BE~BE~','BE~BE~BE' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('BR~BR~BR~','BR~BR~BR' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('BS~BS~BS~','BS~BS~BS' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('CA~CA~CA~','CA~CA~CA' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('CH~CH~CH~','CH~CH~CH' +"|"+ '\n' + str(docID) +"|")
    inventor1 = inventor1.replace('CN~CN~CN~','CN~CN~CN' +"|"+ '\n' + str(docID) +"|")


    #inventor1 = inventor1.replace('QQ~QQ~QQ~','QQ~QQ~QQ' +"|"+ '\n' + str(docID) +"|")

    #special case fixes- these are for strange names fixes in the code that may not create the correct amount of columns.
    inventor1 = inventor1.replace('~None~None~NO~','~None~NO~')
    inventor1 = inventor1.replace('Ramandeep~Chandigarh','Ramandeep|None~Chandigarh')
    inventor1 = inventor1.replace('Esk~eh~r','Eskehr')
    inventor1 = inventor1.replace('Baychar~Eastport','Baychar~None~Eastport')

    inventor1 = inventor1.replace('US~1', '||||||')
    # Every separator still left becomes a column delimiter.
    inventor1 = inventor1.replace('~','|')

    #End the inventor file
    #-------------------------------------------------------------------------------

    #Here are the output print fields- you can change one if you want but remember to comment out all but the one you wish to view.
    # Console preview of the general-info record (Python 2 print statement).
    print "DocID: {0}\nGrantDate: {1}\nApplicationDate: {2}\nNumber of Claims: {3}\nExaminers: {4}\nAssignee: {5}\nInventor: {6}\nUS Cl.: {7}\n".format(docID,grantdate,applicationdate,claimsNum,examiners.encode("UTF-8"),assignees,inventors,uscl1)
    #print "DocID: {0}\nU.S Cl: {1}\nPrimary: {2}\n".format(docID,uscl2,primary1)
    #print "DocID: {0}\nCitation: {1}\n".format(docID,citation.encode("UTF-8"))
    #print "DocID: {0}\nTitle: {1}\nInventors: {2}\n".format(docID,appID,inventor1.encode("UTF-8"))

    #------------------------------------------------------------------------------- IGNORE Everything else below this.
    # Each record starts with the docID and ends with "|" and a newline.
    #Output first general info bits
    outFile.write(str(docID) +"|"+ str(grantdate) +"|"+ str(applicationdate) + "|"+ str(claimsNum) + "|"+ str(examiners.encode("UTF-8")) + "|"+ str(uscl1) + "|"+ str(assignees) + "|"+ str(inventors) +"|"+"\n")

    #Output Classifications only
    outFile2.write(str(docID) +"|"+ str(uscl2) +"|"+ "\n")

    #Output Citations only
    # NOTE(review): unlike the inventor output below, citation is passed to
    # str() without .encode("UTF-8") - confirm this is safe for citations
    # that contain non-ASCII text.
    outFile3.write(str(docID) +"|"+ str(citation) +"|"+"\n")

    #Output inventors only
    outFile4.write(str(docID) + "|"+ str(inventor1.encode("UTF-8")) + "|" +"\n")


# Close all four output files once the loop has finished.
outFile.close()
outFile2.close()
outFile3.close()
outFile4.close()
print "output files complete"
+4

你需要给我们一个关于你得到什么错误等的更多上下文。此外,这听起来像是一个“请为我做”的问题。 SO上的这些问题通常会很快关闭。 – inspectorG4dget

+0

我得到的错误是 >>> Traceback (most recent call last): File "C:\Users\John\Desktop\FINAL BART ALL INFO-Magic Bullet.py", line 46, in <module> assert xml_file is not None AssertionError >>> –

+0

请将完整的错误贴在您的问题文章中。如果没有正确的格式,阅读错误消息是相当困难的,评论中就是这种情况 – inspectorG4dget

回答

1

你所看到的问题并不是 Python 本身的问题。代码会打开 zip 文件,并期望在里面找到一个 xml 文件。assert 语句是一条检查(check)语句,用来确保找到了 xml 文件;如果没有找到,它就会中止你的程序。如果你下载赋给 datasrc 的那个 zip 文件,会发现它是一个空的 zip 文件。代码在其中找不到 xml 文件,所以 xml_file = None;随后执行到 assert 语句时,就会引发 AssertionError。

你也许可以把 assert 去掉,代码照样能运行,但当程序崩溃时,你就不知道原因了。把它留在那里,可以方便地了解程序在何时、何处以及为何失败。