2015-08-18 41 views
0

我想要一个方法:在不了解XML文档结构、也不硬编码节点/元素名称的情况下解析XML文档,并以键值对的形式返回一个Map。(问题标题:在Java中解析结构未知的XML文档)

我目前使用StAX进行解析,现有的实现能够处理这个XML文档,但奇怪的是它并没有解析整个文档,不知为何跳过了一部分数据。

对照XML文档和测试输出,你会发现并不是所有的值都被打印出来了。我可能遗漏了什么?

代码:

/**
 * Parses an XML file without any knowledge of its structure and returns, for
 * every element, the text that follows its start tag, keyed by the element's
 * local name.
 *
 * <p>Every element encountered gets an entry. The value is the character data
 * found right after the start tag (for container elements this is typically
 * just indentation whitespace), or {@code null} when the start tag is followed
 * by a child element or an end tag instead of text. Because the key is only
 * the local name, elements that share a name overwrite one another and the
 * last occurrence in document order wins.
 *
 * @param file the XML file to read
 * @return a map from element local name to the text following its start tag
 * @throws Exception if the file cannot be opened or is not well-formed XML
 */
public Map<String, String> p(File file) throws Exception {

    Map<String, String> map = new HashMap<String, String>();

    XMLInputFactory factory = XMLInputFactory.newInstance();
    // Deliver adjacent character data as a single event so values are never truncated.
    factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
    // Do not let the parser fetch external entities referenced by the document (XXE).
    factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

    FileInputStream in = new FileInputStream(file);
    try {
        XMLStreamReader xr = factory.createXMLStreamReader(in);
        try {
            // Local name of the element whose start tag was just read and whose
            // text has not been seen yet; null when no element is waiting.
            String pending = null;

            while (xr.hasNext()) {
                // Exactly one event is consumed per iteration, so a child start
                // tag that directly follows its parent's start tag is never lost.
                switch (xr.next()) {
                    case XMLStreamReader.START_ELEMENT:
                        pending = xr.getLocalName();
                        // Register the element even if no text follows it.
                        map.put(pending, null);
                        break;
                    case XMLStreamReader.CHARACTERS:
                    case XMLStreamReader.CDATA:
                    case XMLStreamReader.SPACE:
                        if (pending != null) {
                            map.put(pending, xr.getText());
                            pending = null;
                        }
                        break;
                    case XMLStreamReader.COMMENT:
                    case XMLStreamReader.PROCESSING_INSTRUCTION:
                        // Not content: keep waiting for the element's text.
                        break;
                    default:
                        // End tags and anything else close the "text right after
                        // the start tag" window.
                        pending = null;
                        break;
                }
            }
        } finally {
            // XMLStreamReader.close() does not close the underlying stream,
            // so the stream is closed separately below.
            xr.close();
        }
    } finally {
        in.close();
    }
    return map;
}



/**
 * Parses {@code xmlDir/request.xml} with {@code p}, logs every key/value pair
 * of the resulting map at debug level, and then checks four expected values.
 */
@Test
public void test() throws Exception, FactoryConfigurationError, Exception {
    final Map<String, String> parsed = p(new File("xmlDir/request.xml"));

    // Dump the whole map so missing entries are easy to spot in the log
    for (final String key : parsed.keySet()) {
        logger.debug("Key: " + key);
        logger.debug("Value: " + parsed.get(key));
    }

    // Spot-check a handful of leaf elements
    Assert.assertEquals(parsed.get("MonthlyPlanPremiumAmtPP"), "136");
    Assert.assertEquals(parsed.get("MonthlyAdvancedPTCAmtPP"), "125");
    Assert.assertEquals(parsed.get("AdjustedGrossIncomeAmt"), "22000");
    Assert.assertEquals(parsed.get("TotalExemptionsCnt"), "1");
}

输出:

2015-08-18 16:21:44,408 : Key: IRS1095A 
2015-08-18 16:21:44,409 : Value: 

2015-08-18 16:21:44,409 : Key: MonthlyAdvancedPTCAmtPP 
2015-08-18 16:21:44,409 : Value: 125 
2015-08-18 16:21:44,409 : Key: IndividualReturnFilingStatusCd 
2015-08-18 16:21:44,409 : Value: 1 
2015-08-18 16:21:44,409 : Key: IRS1040 
2015-08-18 16:21:44,409 : Value: 

2015-08-18 16:21:44,409 : Key: MonthlyPTCInformationGrpPP 
2015-08-18 16:21:44,409 : Value: 

2015-08-18 16:21:44,409 : Key: MonthlyPremiumSLCSPAmtPP 
2015-08-18 16:21:44,409 : Value: 250 
2015-08-18 16:21:44,409 : Key: Filer 
2015-08-18 16:21:44,409 : Value: 

2015-08-18 16:21:44,409 : Key: TotalPremiumSLCSPAmtPP 
2015-08-18 16:21:44,409 : Value: 3000 
2015-08-18 16:21:44,409 : Key: ResidentStateAbbreviationCdPP 
2015-08-18 16:21:44,409 : Value: CA 
2015-08-18 16:21:44,409 : Key: TotalPlanPremiumAmtPP 
2015-08-18 16:21:44,409 : Value: 1632 
2015-08-18 16:21:44,409 : Key: TotalExemptionsCnt 
2015-08-18 16:21:44,409 : Value: 1 
2015-08-18 16:21:44,409 : Key: TotalAdvancedPTCAmtPP 
2015-08-18 16:21:44,409 : Value: 1500 
2015-08-18 16:21:44,409 : Key: MonthlyPlanPremiumAmtPP 
2015-08-18 16:21:44,409 : Value: 136 
2015-08-18 16:21:44,409 : Key: RecipientSSNPP 
2015-08-18 16:21:44,409 : Value: 555-11-2222 
2015-08-18 16:21:44,409 : Key: WagesSalariesAndTipsAmt 
2015-08-18 16:21:44,409 : Value: 22000 
2015-08-18 16:21:44,409 : Key: MonthCdPP 
2015-08-18 16:21:44,409 : Value: NOVEMBER 
2015-08-18 16:21:44,409 : Key: ReturnData 
2015-08-18 16:21:44,409 : Value: 

2015-08-18 16:21:44,409 : Key: PrimaryResidentStatesInfoGrpPP 
2015-08-18 16:21:44,409 : Value: 

2015-08-18 16:21:44,409 : Key: SelfSelectPINGrp 
2015-08-18 16:21:44,409 : Value: 

2015-08-18 16:21:44,409 : Key: ResidentStateInfoPP 
2015-08-18 16:21:44,409 : Value: 

2015-08-18 16:21:44,409 : Key: Return 
2015-08-18 16:21:44,409 : Value: 

2015-08-18 16:21:44,409 : Key: PrimaryBirthDt 
2015-08-18 16:21:44,409 : Value: 1970-01-01 
2015-08-18 16:21:44,409 : Key: ReturnHeader 
2015-08-18 16:21:44,409 : Value: 

2015-08-18 16:21:44,409 : Key: AdjustedGrossIncomeAmt 
2015-08-18 16:21:44,409 : Value: 22000 
2015-08-18 16:21:44,409 : Key: PrimarySSN 
2015-08-18 16:21:44,409 : Value: 555-11-2222 

XML文档:request.xml

<Return xmlns="http://www.irs.gov/efile"> 
    <ReturnData> 
    <IRS1095A uuid="a77f40a2-af31-4404-a27d-4c1eaad730c2"> 
     <MonthlyPTCInformationGrpPP uuid="69dc9dd5-5415-4ee4-a199-19b2dbb701be"> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     <MonthCdPP>SEPTEMBER</MonthCdPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     </MonthlyPTCInformationGrpPP> 
     <MonthlyPTCInformationGrpPP uuid="8495fa61-0e7c-45e3-8f07-9765f4ef2fc3"> 
     <MonthCdPP>OCTOBER</MonthCdPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     </MonthlyPTCInformationGrpPP> 
     <MonthlyPTCInformationGrpPP uuid="7de1052f-6107-41da-aea4-e4495018fc80"> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     <MonthCdPP>APRIL</MonthCdPP> 
     </MonthlyPTCInformationGrpPP> 
     <MonthlyPTCInformationGrpPP uuid="634d5af9-51fb-42ee-a90d-5a4f421e6854"> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthCdPP>JUNE</MonthCdPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     </MonthlyPTCInformationGrpPP> 
     <MonthlyPTCInformationGrpPP uuid="a2f7de3f-650c-4a5e-b26c-30cfd7782d6c"> 
     <MonthCdPP>MAY</MonthCdPP> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     </MonthlyPTCInformationGrpPP> 
     <MonthlyPTCInformationGrpPP uuid="a77f40a2-af31-4404-a27d-4c1eaad730c2"> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     <MonthCdPP>JANUARY</MonthCdPP> 
     </MonthlyPTCInformationGrpPP> 
     <MonthlyPTCInformationGrpPP uuid="01650aee-9d5d-4ce1-9079-ebedea3bf416"> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     <MonthCdPP>MARCH</MonthCdPP> 
     </MonthlyPTCInformationGrpPP> 
     <MonthlyPTCInformationGrpPP uuid="581ba189-222d-4999-aa1a-3b290666ef5f"> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     <MonthCdPP>AUGUST</MonthCdPP> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     </MonthlyPTCInformationGrpPP> 
     <TotalPremiumSLCSPAmtPP>3000</TotalPremiumSLCSPAmtPP> 
     <MonthlyPTCInformationGrpPP uuid="549ff57a-58dc-4365-b05c-e3e520b3e8cb"> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     <MonthCdPP>DECEMBER</MonthCdPP> 
     </MonthlyPTCInformationGrpPP> 
     <MonthlyPTCInformationGrpPP uuid="195836cf-32b3-4316-99d4-6b1eab31e16d"> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthCdPP>JULY</MonthCdPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     </MonthlyPTCInformationGrpPP> 
     <MonthlyPTCInformationGrpPP uuid="c1289d91-7ce1-41ee-9c8a-f72212e82752"> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     <MonthCdPP>FEBRUARY</MonthCdPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     </MonthlyPTCInformationGrpPP> 
     <TotalAdvancedPTCAmtPP>1500</TotalAdvancedPTCAmtPP> 
     <RecipientSSNPP>555-11-2222</RecipientSSNPP> 
     <MonthlyPTCInformationGrpPP uuid="50876222-165d-442a-81e0-0b05dc3c30fb"> 
     <MonthlyAdvancedPTCAmtPP>125</MonthlyAdvancedPTCAmtPP> 
     <MonthlyPlanPremiumAmtPP>136</MonthlyPlanPremiumAmtPP> 
     <MonthCdPP>NOVEMBER</MonthCdPP> 
     <MonthlyPremiumSLCSPAmtPP>250</MonthlyPremiumSLCSPAmtPP> 
     </MonthlyPTCInformationGrpPP> 
     <TotalPlanPremiumAmtPP>1632</TotalPlanPremiumAmtPP> 
    </IRS1095A> 
    <IRS1040> 
     <IndividualReturnFilingStatusCd>1</IndividualReturnFilingStatusCd> 
     <WagesSalariesAndTipsAmt>22000</WagesSalariesAndTipsAmt> 
     <TotalExemptionsCnt>1</TotalExemptionsCnt> 
     <AdjustedGrossIncomeAmt>22000</AdjustedGrossIncomeAmt> 
    </IRS1040> 
    </ReturnData> 
    <ReturnHeader> 
    <SelfSelectPINGrp> 
     <PrimaryBirthDt>1970-01-01</PrimaryBirthDt> 
    </SelfSelectPINGrp> 
    <Filer> 
     <PrimarySSN>555-11-2222</PrimarySSN> 
     <PrimaryResidentStatesInfoGrpPP> 
     <ResidentStateInfoPP uuid="a77f40a2-af31-4404-a27d-4c1eaad730c2"> 
      <ResidentStateAbbreviationCdPP>CA</ResidentStateAbbreviationCdPP> 
     </ResidentStateInfoPP> 
     </PrimaryResidentStatesInfoGrpPP> 
    </Filer> 
    </ReturnHeader> 
</Return> 
+0

我注意到重复的条目被跳过了,只打印出了不重复的条目。例如,文档中有多个同名节点(如 MonthlyPTCInformationGrpPP),它们之间只有 uuid 不同 – mosawi

+0

是否应该给键名加上递增的后缀,以避免同名键相互覆盖的问题?HashMap 的键不是必须唯一的吗? – skoll

+0

或者干脆不使用 Map,而是改用列表(例如由键值对组成的 List) – skoll

回答

1

你必须通过 uuid 来区分名称相同的多个元素。可以通过遍历 XML 属性来提取这些 uuid。

// Walk the stream one event at a time. `name` tracks the most recent start
// tag, `attrName` the first attribute name seen on such a tag, and `value`
// the text of the most recent CHARACTERS event.
String name = "", value = "", attrName = "";
while (xr.hasNext()) {
    final int eventType = xr.next();
    if (eventType == XMLStreamReader.START_ELEMENT) {
        name = xr.getLocalName();
        // Only elements that carry at least one attribute are printed,
        // and only their first attribute is reported.
        if (xr.getAttributeCount() > 0) {
            attrName = xr.getAttributeName(0).getLocalPart();
            System.out.println(name + " " + attrName + " " + xr.getAttributeValue(0));
        }
    } else if (eventType == XMLStreamReader.CHARACTERS) {
        value = xr.getText();
    }
}

运行后会得到如下输出:

IRS1095A uuid a77f40a2-af31-4404-a27d-4c1eaad730c2 
MonthlyPTCInformationGrpPP uuid 69dc9dd5-5415-4ee4-a199-19b2dbb701be 
MonthlyPTCInformationGrpPP uuid 8495fa61-0e7c-45e3-8f07-9765f4ef2fc3 
MonthlyPTCInformationGrpPP uuid 7de1052f-6107-41da-aea4-e4495018fc80 
MonthlyPTCInformationGrpPP uuid 634d5af9-51fb-42ee-a90d-5a4f421e6854 
MonthlyPTCInformationGrpPP uuid a2f7de3f-650c-4a5e-b26c-30cfd7782d6c 
MonthlyPTCInformationGrpPP uuid a77f40a2-af31-4404-a27d-4c1eaad730c2 
MonthlyPTCInformationGrpPP uuid 01650aee-9d5d-4ce1-9079-ebedea3bf416 
MonthlyPTCInformationGrpPP uuid 581ba189-222d-4999-aa1a-3b290666ef5f 
MonthlyPTCInformationGrpPP uuid 549ff57a-58dc-4365-b05c-e3e520b3e8cb 
MonthlyPTCInformationGrpPP uuid 195836cf-32b3-4316-99d4-6b1eab31e16d 
MonthlyPTCInformationGrpPP uuid c1289d91-7ce1-41ee-9c8a-f72212e82752 
MonthlyPTCInformationGrpPP uuid 50876222-165d-442a-81e0-0b05dc3c30fb 
ResidentStateInfoPP uuid a77f40a2-af31-4404-a27d-4c1eaad730c2 

在 XMLStreamReader.CHARACTERS 事件下可以继续提取元素的文本内容。仅以元素名称为键的 Map<String, String> 无法完整表示这个 XML(同名元素会相互覆盖)。我建议使用 XPath。

+0

但是值呢?除了用属性把同名节点区分为各自独立的实体之外,我并不关心这些属性 – mosawi

+1

`value = xr.getText();` 这一行在 `XMLStreamReader.CHARACTERS` 分支下提取这些值。注意文本中可能包含 `\n`(换行符)。 –

+0

我明白了。你说用 XPath 代替 Map,能举一个例子吗?我对此很陌生。最终我希望键是一个 XPath,例如对于键 “ResidentStateInfoPP”,我想打印成 [“/Return/ReturnHeader/Filer/PrimaryResidentStatesInfoGrpPP/ResidentStateInfoPP/ResidentStateAbbreviationCdPP”, CA] – mosawi