2016-02-19 44 views
0

我有一个xml数据集,其格式如下。r xml到属性的数据框

<?xml version="1.0" encoding="utf-8"?> 
 
<!DOCTYPE population SYSTEM "http://www.matsim.org/files/dtd/population_v5.dtd"> 
 

 
<population> 
 

 
<!-- ====================================================================== --> 
 

 
\t <person id="10000061"> 
 
\t \t <plan score="219.62581874242716" selected="yes"> 
 
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" /> 
 
\t \t \t <leg mode="ride" dep_time="15:07:00" trav_time="00:03:27" arr_time="15:10:27"> 
 
\t \t \t \t <route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" /> 
 
\t \t \t <leg mode="car" dep_time="16:21:00" trav_time="00:09:44" arr_time="16:30:44"> 
 
\t \t \t \t <route type="links">13904 21207 21208 13980 21187 21188 14148 14144 14130 14129</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" /> 
 
\t \t \t <leg mode="ride" dep_time="17:37:00" trav_time="00:09:46" arr_time="17:46:46"> 
 
\t \t \t \t <route type="links">14129 14143 14147 14161 14171 14189 14195 14120 14106 14051 13941 13938 13976 14044 21259 21258</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" /> 
 
\t \t </plan> 
 

 
\t \t <plan score="218.9756035020247" selected="no"> 
 
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" /> 
 
\t \t \t <leg mode="ride" dep_time="15:07:00" trav_time="00:03:26" arr_time="15:10:26"> 
 
\t \t \t \t <route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" /> 
 
\t \t \t <leg mode="car" dep_time="16:21:00" trav_time="00:08:46" arr_time="16:29:46"> 
 
\t \t \t \t <route type="links">13904 13905 13891 13855 21239 21240 13887 13885 13869 13870 13920 13974 14070 14075 14103 14109 14123 14129</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" /> 
 
\t \t \t <leg mode="ride" dep_time="17:37:00" trav_time="00:11:06" arr_time="17:48:06"> 
 
\t \t \t \t <route type="links">14129 14143 14147 14161 14150 14098 14094 14095 14113 14106 14051 13941 13938 13976 14044 21259 21258</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" /> 
 
\t \t </plan> 
 

 
\t \t <plan score="218.5148700010285" selected="no"> 
 
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" /> 
 
\t \t \t <leg mode="ride" dep_time="15:07:00" trav_time="00:03:26" arr_time="15:10:26"> 
 
\t \t \t \t <route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" /> 
 
\t \t \t <leg mode="car" dep_time="16:21:00" trav_time="00:08:15" arr_time="16:29:15"> 
 
\t \t \t \t <route type="links">13904 13905 13906 13980 21187 21188 14148 14144 14130 14129</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" /> 
 
\t \t \t <leg mode="ride" dep_time="17:37:00" trav_time="00:11:18" arr_time="17:48:18"> 
 
\t \t \t \t <route type="links">14129 14130 14124 14110 14104 14077 14071 13975 13921 13871 13868 13884 13886 13888 13894 13904 13918 13924 13938 13976 14044 21259 21258</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" /> 
 
\t \t </plan> 
 

 
\t </person> 
 

 
<!-- ====================================================================== --> 
 

 
\t <person id="10000302"> 
 
\t \t <plan score="209.66504470021556" selected="yes"> 
 
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" /> 
 
\t \t \t <leg mode="car" dep_time="07:56:00" trav_time="00:03:00" arr_time="07:59:00"> 
 
\t \t \t \t <route type="links">21256 13966 14056 14057</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" /> 
 
\t \t \t <leg mode="car" dep_time="10:28:00" trav_time="00:08:20" arr_time="10:36:20"> 
 
\t \t \t \t <route type="links">14057 14049 14045 13977 13939 13925 13919 21207 21208 13980 14046 14095 21191</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" /> 
 
\t \t \t <leg mode="car" dep_time="11:52:00" trav_time="00:08:33" arr_time="12:00:33"> 
 
\t \t \t \t <route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" /> 
 
\t \t \t <leg mode="car" dep_time="12:11:00" trav_time="00:06:35" arr_time="12:17:35"> 
 
\t \t \t \t <route type="links">21256 21257 21258 14045 13977 13939 13925 13919 13905 13906</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" /> 
 
\t \t \t <leg mode="car" dep_time="13:30:00" trav_time="00:05:30" arr_time="13:35:30"> 
 
\t \t \t \t <route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" /> 
 
\t \t </plan> 
 

 
\t \t <plan score="205.5456839457717" selected="no"> 
 
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" /> 
 
\t \t \t <leg mode="car" dep_time="07:56:00" trav_time="00:02:15" arr_time="07:58:15"> 
 
\t \t \t \t <route type="links">21256 13966 14056 14057</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" /> 
 
\t \t \t <leg mode="car" dep_time="10:28:00" trav_time="00:06:51" arr_time="10:34:51"> 
 
\t \t \t \t <route type="links">14057 14056 14177 14191 14214 14247 14235 14213 14211 14198 14120 14114 21191</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" /> 
 
\t \t \t <leg mode="car" dep_time="11:52:00" trav_time="00:07:45" arr_time="11:59:45"> 
 
\t \t \t \t <route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" /> 
 
\t \t \t <leg mode="car" dep_time="12:11:00" trav_time="00:07:51" arr_time="12:18:51"> 
 
\t \t \t \t <route type="links">21256 13915 13823 13767 13743 13731 13732 13837 13831 13819 13820 13854 13890 13906</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" /> 
 
\t \t \t <leg mode="car" dep_time="13:30:00" trav_time="00:08:54" arr_time="13:38:54"> 
 
\t \t \t \t <route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" /> 
 
\t \t </plan> 
 

 
\t \t <plan score="203.4205865037132" selected="no"> 
 
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" /> 
 
\t \t \t <leg mode="car" dep_time="07:56:00" trav_time="00:03:15" arr_time="07:59:15"> 
 
\t \t \t \t <route type="links">21256 13966 14056 14057</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" /> 
 
\t \t \t <leg mode="car" dep_time="10:28:00" trav_time="00:06:41" arr_time="10:34:41"> 
 
\t \t \t \t <route type="links">14057 14049 14045 13977 13939 13940 14050 14105 14114 21191</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" /> 
 
\t \t \t <leg mode="car" dep_time="11:52:00" trav_time="00:09:12" arr_time="12:01:12"> 
 
\t \t \t \t <route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" /> 
 
\t \t \t <leg mode="car" dep_time="12:11:00" trav_time="00:05:10" arr_time="12:16:10"> 
 
\t \t \t \t <route type="links">21256 13966 14049 14045 13977 13939 13925 13919 13905 13906</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" /> 
 
\t \t \t <leg mode="car" dep_time="13:30:00" trav_time="00:05:30" arr_time="13:35:30"> 
 
\t \t \t \t <route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route> 
 
\t \t \t </leg> 
 
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" /> 
 
\t \t </plan> 
 

 
\t </person> 
 

 
<!-- ====================================================================== --> 
 

 
</population>

从这个XML数据,我想生成具有以下结构的数据帧。

person score  selected act.typ act.x  act.y act_start act_end  leg.mod leg_dep leg_trav leg_arr 
10000061 219.6258 yes   home 334867.2 3126571 3:00:00  15:07:00 ride  15:07:00 0:03:27  15:10:27 
10000061 219.6258 yes   shop 332634.9 3127079 15:12:00 16:21:00 car  16:21:00 0:09:44  16:30:44 
10000061 219.6258 yes   shop 331666.4 3129306 16:25:00 17:37:00 ride  17:37:00 0:09:46  17:46:46 
10000061 219.6258 yes   home 334867.2 3126571 17:45:00 26:59:00 NA  NA   NA   NA 
10000302 209.665  yes   home 334598.4 3126269 3:00:00  7:56:00  car  7:56:00 0:03:00  7:59:00 
10000302 209.665  yes   work 335957.1 3128105 8:04:00  10:28:00 car  10:28:00 0:08:20  10:36:20 
10000302 209.665  yes   social 333032.8 3128760 10:33:00 11:52:00 car  11:52:00 0:08:33  12:00:33 
10000302 209.665  yes   home 334598.4 3126269 11:59:00 12:11:00 car  12:11:00 0:06:35  12:17:35 
10000302 209.665  yes   social 332302.2 3127536 12:17:00 13:30:00 car  13:30:00 0:05:30  13:35:30 
10000302 209.665  yes   home 334598.4 3126269 13:36:00 26:59:00 NA  NA   NA   NA 

我已经看过了XML图书馆和各种功能,如XpathApplyxmlGetAttr,并能够在每个节点级别检索各列表和dataframes。但是,正如您所看到的,我需要通过维护父子关系将这些不同的数据子集连接成一个数据框。此外,我只想选择属性所选的值为“是”的父属性和子属性。我也不想要最终节点的值,即路由。我在这里只粘贴了两个人的信息。实际数据有300万人的信息。

回答

0

既然发布了这个问题,我做了一些研究,并遇到了这个post,并用它来提出一个答案。以下是我的代码。我相信,这可以以非常有效的方式完成。因此,我不会接受这个作为最终答案,直到有人看一看并批准它或提供一个更好的选择。

library(XML) 
 
file1 <- "C:/Users/s/Desktop/plans.xml" 
 
plans <- xmlParse(file1) 
 

 
idNodes <- getNodeSet(plans, "//person[@id]") 
 
ids <- lapply(idNodes, function(x) xmlAttrs(x)['id']) 
 

 
attribact <- vector("list", length(ids)*14) 
 
k <- 1 
 
for (i in 1:length(ids)) 
 
{ 
 
    act <- xpathApply(idNodes[[i]], path = paste("//person[@id=", as.numeric(ids[[i]]),"]//plan[@selected='yes']//act|leg", sep=""), xmlAttrs) 
 
    for (j in 1:length(act)) 
 
    { 
 
    attribact[[k]] <- c(act[[j]],ids[[i]]) 
 
    k <- k+1  
 
    } 
 
} 
 
attribact <- attribact[attribact!='NULL'] 
 
k1 <- t(as.data.frame(attribact)) 
 

 
attribleg <- vector("list", length(ids)*10) 
 
k <- 1 
 
for (i in 1:length(ids)) 
 
{ 
 
    leg <- xpathApply(idNodes[[i]], path = paste("//person[@id=", as.numeric(ids[[i]]),"]//plan[@selected='yes']//leg", sep=""), xmlAttrs) 
 
    leg[[length(leg)+1]] <- c(NA,NA,NA,NA) 
 
    for (j in 1:length(leg)) 
 
    { 
 
    attribleg[[k]] <- c(leg[[j]],ids[[i]]) 
 
    k <- k+1  
 
    } 
 
} 
 
attribleg <- attribleg[attribleg!='NULL'] 
 
k2 <- t(as.data.frame(attribleg)) 
 

 
data1 <- cbind(k1,k2)

1

每当有复杂的XML文件的处理,考虑一个XSLT的解决方案。 XSLT是一种声明性的专用语言(与SQL类型相同),用于转换XML文档。由于数据集需要行和列的两个维度,因此XSLT可以转换嵌套节点和属性以满足此结构。

虽然R没有通用的XSLT库,但R可以使用system()来调用外部程序来处理XSLT转换。下面是开源Python示例,但几乎任何通用目的(Java,C#,PHP,Perl,甚至Excel VBA)语言,专用可执行处理器(Xalan and Saxon)或命令行程序(PowerShell,Bash)都可以处理XSLT。

XSLT脚本(保存为的.xsl或.xslt)

<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> 
<xsl:output version="1.0" encoding="UTF-8" indent="yes" /> 
<xsl:strip-space elements="*"/> 

    <xsl:template match="/"> 
    <data> 
     <xsl:apply-templates select="*"/> 
    </data> 
    </xsl:template> 

    <xsl:template match="person|plan"> 
    <xsl:apply-templates /> 
    </xsl:template> 

    <xsl:template match="act"> 
    <person>  
     <id><xsl:value-of select="ancestor::person/@id"/></id> 
     <score><xsl:value-of select="ancestor::plan/@score"/></score> 
     <selected><xsl:value-of select="ancestor::plan/@selected"/></selected> 
     <type><xsl:value-of select="@type"/></type> 
     <link><xsl:value-of select="@link"/></link> 
     <x><xsl:value-of select="@x"/></x> 
     <y><xsl:value-of select="@y"/></y>  
     <start_time><xsl:value-of select="@start_time"/></start_time> 
     <end_time><xsl:value-of select="@end_time"/></end_time> 
     <mode><xsl:value-of select="following-sibling::leg/@mode"/></mode> 
     <dep_time><xsl:value-of select="following-sibling::leg/@dep_time"/></dep_time> 
     <trav_time><xsl:value-of select="following-sibling::leg/@trav_time"/></trav_time> 
     <arr_time><xsl:value-of select="following-sibling::leg/@arr_time"/></arr_time> 
    </person> 
    </xsl:template> 

    <xsl:template match="route"/> 
</xsl:transform> 

的Python脚本(使用lxml的模块,解析上述脚本)

import lxml.etree as ET 

dom = ET.parse('Input.xml')) 
xslt = ET.parse('XSLTScript.xsl')) 

transform = ET.XSLT(xslt) 
newdom = transform(dom) 

tree_out = ET.tostring(newdom, encoding='UTF-8', pretty_print=True, xml_declaration=True) 

xmlfile = open('Output.xml','wb') 
xmlfile.write(tree_out) 
xmlfile.close() 

XML转换后的输出

<?xml version='1.0' encoding='UTF-8'?> 
<data> 
    <person> 
    <id>10000061</id> 
    <score>219.62581874242716</score> 
    <selected>yes</selected> 
    <type>home</type> 
    <link>21258</link> 
    <x>334867.243653</x> 
    <y>3126570.70778</y> 
    <start_time>03:00:00</start_time> 
    <end_time>15:07:00</end_time> 
    <mode>ride</mode> 
    <dep_time>15:07:00</dep_time> 
    <trav_time>00:03:27</trav_time> 
    <arr_time>15:10:27</arr_time> 
    </person> 
    <person> 
    <id>10000061</id> 
    <score>219.62581874242716</score> 
    <selected>yes</selected> 
    <type>shop</type> 
    <link>13904</link> 
    <x>332634.86999</x> 
    <y>3127078.96383</y> 
    <start_time>15:12:00</start_time> 
    <end_time>16:21:00</end_time> 
    <mode>car</mode> 
    <dep_time>16:21:00</dep_time> 
    <trav_time>00:09:44</trav_time> 
    <arr_time>16:30:44</arr_time> 
    </person> 
    <person> 
    <id>10000061</id> 
    <score>219.62581874242716</score> 
    <selected>yes</selected> 
    <type>shop</type> 
    <link>14129</link> 
    <x>331666.364904</x> 
    <y>3129306.48785</y> 
    <start_time>16:25:00</start_time> 
    <end_time>17:37:00</end_time> 
    <mode>ride</mode> 
    <dep_time>17:37:00</dep_time> 
    <trav_time>00:09:46</trav_time> 
    <arr_time>17:46:46</arr_time> 
    </person> 
    ... 

ř脚本

​​ (以上在命令行Python脚本调用)