我有多个 XML 文件,只想提取其中某一层级的部分内容,并把这些值存入一个 data.frame。
该层级的名称在所有文件中始终相同,即 Invoice(“发票”)。
问题:如何在 R 中将 XML 里的值提取到数据框中?
我想提取 Invoice(“发票”)层级的数据。该层级下的每个子节点都应作为数据框中的一行(一个行实体)。对于每个行实体,应提取 value、confidence 和 zone(在下面的示例 XML 中,value 和 zone 是子元素,confidence 是该节点的属性)。
唯一的问题是,对于每个文件,实体的数量会有所不同。
最终得到的 data.frame 应该是这样的:
Doc. Nr. Entity Value Zone Confidence
doc1 OcrText Text example 19 101 941 2625 76
doc1 InvoiceDate 17-06-2016 105 8 862 1555 100
doc1 InvoiceDate__day 17 105 8 862 1555 100
借助 rvest 和 XML 这两个包,我已经能够提取 zone:
read_xml(xmlfile) %>% xml_nodes("Invoice") %>% xml_nodes("zone") %>% xml_text()
但我无法提取 Invoice(“发票”)层级下各子节点的 value、confidence,以及所有这些子节点的名称。
这是XML文件的例子:
<?xml version="1.0" encoding="utf-8"?>
<DOKuStar baseType="documentType" state="Ok" confidence="0" version="2.0">
<Invoice baseType="documentType" state="Ok" confidence="0" producer="DOKuStar">
<sources>
<image guid=" fec8" />
</sources>
<OcrText baseType="fieldType" state="Reject" confidence="76">
<value> Text example
</value>
<zone>19 101 941 2625</zone>
<sources>
<image guid=" fec8" />
</sources>
</OcrText>
<InvoiceDate baseType="fieldType" state="Empty" confidence="100" class="dateType">
<value>17-06-2016
</value>
<zone>105 8 862 1555</zone>
<sources>
<image guid=" fec8" />
</sources>
</InvoiceDate>
<annotations>
<annotation key="FileOutputPath">E:\..\Outgoing\</annotation>
</annotations>
<InvoiceDate__day baseType="fieldType" state="Empty" confidence="100">
<value>17
</value>
<zone>105 8 862 1555</zone>
<sources>
<image guid=" fec8" />
</sources>
</InvoiceDate__day>
<InvoiceDate__month baseType="fieldType" state="Empty" confidence="100">
<value>06
</value>
<zone>105 8 862 1555</zone>
<sources>
<image guid=" fec8" />
</sources>
</InvoiceDate__month>
<InvoiceDate__year baseType="fieldType" state="Empty" confidence="100">
<value>2016
</value>
<zone>105 8 862 1555</zone>
<sources>
<image guid=" fec8" />
</sources>
</InvoiceDate__year>
<InvoiceNumber baseType="fieldType" state="Empty" confidence="100">
<value>12365
</value>
<zone>105 80 862 1555</zone>
<sources>
<image guid=" fec8" />
</sources>
</InvoiceNumber>
<InvoiceTotalsTotalAmount baseType="fieldType" state="Ok" confidence="87">
<value>21.98</value>
<zone>595 2062 77 34</zone>
<sources>
<image guid=" fec8" />
</sources>
</InvoiceTotalsTotalAmount>
<InvoiceTotalsNetAmount baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</InvoiceTotalsNetAmount>
<InvoiceTotalsVatAmount baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</InvoiceTotalsVatAmount>
<InvoiceTotalsCurrency baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</InvoiceTotalsCurrency>
<InvoiceTotals baseType="tableType" state="Ok" confidence="87">
<value>21.98 </value>
<zone>595 2062 77 34</zone>
<sources>
<image guid=" fec8" />
</sources>
<row baseType="tableRowType" state="Ok" confidence="0">
<TotalAmount baseType="fieldType" state="Ok" confidence="100">
<value>3.10</value>
<zone>596 2029 63 30</zone>
<sources>
<image guid=" fec8" />
</sources>
</TotalAmount>
<NetAmount baseType="fieldType" state="Ok" confidence="69">
<value>2.56</value>
<zone>287 2031 64 31</zone>
<sources>
<image guid=" fec8" />
</sources>
</NetAmount>
<VatAmount baseType="fieldType" state="Ok" confidence="78">
<value>0.54</value>
<zone>444 2030 59 31</zone>
<sources>
<image guid=" fec8" />
</sources>
</VatAmount>
<VatRate baseType="fieldType" state="Ok" confidence="83">
<value>21.00</value>
<zone>141 2035 30 26</zone>
<sources>
<image guid=" fec8" />
</sources>
</VatRate>
<Currency baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Currency>
<Type baseType="fieldType" state="Ok" confidence="0">
<value>Vat</value>
</Type>
</row>
<row baseType="tableRowType" state="Ok" confidence="0">
<TotalAmount baseType="fieldType" state="Ok" confidence="56">
<value>18.88</value>
<zone>603 1993 73 33</zone>
<sources>
<image guid=" fec8" />
</sources>
</TotalAmount>
<NetAmount baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</NetAmount>
<VatAmount baseType="fieldType" state="Ok" confidence="57">
<value>2.99</value>
<zone>653 1311 62 33</zone>
<sources>
<image guid=" fec8" />
</sources>
</VatAmount>
<VatRate baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</VatRate>
<Currency baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Currency>
<Type baseType="fieldType" state="Ok" confidence="0">
<value>Vat</value>
</Type>
</row>
</InvoiceTotals>
<Address baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Address>
<Address__firstname baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Address__firstname>
<Address__lastname baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Address__lastname>
<Address__city baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Address__city>
<Address__cityline baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Address__cityline>
<Address__nameline baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Address__nameline>
<Address__streetline baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Address__streetline>
<Address__streetname baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Address__streetname>
<Address__streetnumber baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Address__streetnumber>
<Address__zipcode baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Address__zipcode>
<Postcode baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Postcode>
<BankAccountNumber baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>215 15 1 1</zone>
<sources>
<image guid=" fec8" />
</sources>
</BankAccountNumber>
<InvoiceAcceptgiroCode baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>215 15 1 1</zone>
<sources>
<image guid=" fec8" />
</sources>
</InvoiceAcceptgiroCode>
<Website baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</Website>
<EmailAddress baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</EmailAddress>
<BICCode baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>215 15 1 1</zone>
<sources>
<image guid=" fec8" />
</sources>
</BICCode>
<CoCNumber baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>215 15 1 1</zone>
<sources>
<image guid=" fec8" />
</sources>
</CoCNumber>
<DebtorNumber baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>215 15 1 1</zone>
<sources>
<image guid=" fec8" />
</sources>
</DebtorNumber>
<IBANCode baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</IBANCode>
<IsCreditNote baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>105 8 862 1555</zone>
<sources>
<image guid=" fec8" />
</sources>
</IsCreditNote>
<IsKvKInvoice baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</IsKvKInvoice>
<VATNumber baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>0 8 967 2974</zone>
<sources>
<image guid=" fec8" />
</sources>
</VATNumber>
<ScanFormAdministration baseType="fieldType" state="Empty" confidence="0">
<value>
</value>
<zone>215 15 1 1</zone>
<sources>
<image guid=" fec8" />
</sources>
</ScanFormAdministration>
</Invoice>
<sourceInstances>
</sourceInstances>
<annotations>
</annotations>
</DOKuStar>