我有一个 XML 数据集,其格式如下。我的目标是在 R 中把 XML 的属性转换为数据框。
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE population SYSTEM "http://www.matsim.org/files/dtd/population_v5.dtd">
<population>
<!-- ====================================================================== -->
\t <person id="10000061">
\t \t <plan score="219.62581874242716" selected="yes">
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" />
\t \t \t <leg mode="ride" dep_time="15:07:00" trav_time="00:03:27" arr_time="15:10:27">
\t \t \t \t <route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route>
\t \t \t </leg>
\t \t \t <act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" />
\t \t \t <leg mode="car" dep_time="16:21:00" trav_time="00:09:44" arr_time="16:30:44">
\t \t \t \t <route type="links">13904 21207 21208 13980 21187 21188 14148 14144 14130 14129</route>
\t \t \t </leg>
\t \t \t <act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" />
\t \t \t <leg mode="ride" dep_time="17:37:00" trav_time="00:09:46" arr_time="17:46:46">
\t \t \t \t <route type="links">14129 14143 14147 14161 14171 14189 14195 14120 14106 14051 13941 13938 13976 14044 21259 21258</route>
\t \t \t </leg>
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" />
\t \t </plan>
\t \t <plan score="218.9756035020247" selected="no">
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" />
\t \t \t <leg mode="ride" dep_time="15:07:00" trav_time="00:03:26" arr_time="15:10:26">
\t \t \t \t <route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route>
\t \t \t </leg>
\t \t \t <act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" />
\t \t \t <leg mode="car" dep_time="16:21:00" trav_time="00:08:46" arr_time="16:29:46">
\t \t \t \t <route type="links">13904 13905 13891 13855 21239 21240 13887 13885 13869 13870 13920 13974 14070 14075 14103 14109 14123 14129</route>
\t \t \t </leg>
\t \t \t <act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" />
\t \t \t <leg mode="ride" dep_time="17:37:00" trav_time="00:11:06" arr_time="17:48:06">
\t \t \t \t <route type="links">14129 14143 14147 14161 14150 14098 14094 14095 14113 14106 14051 13941 13938 13976 14044 21259 21258</route>
\t \t \t </leg>
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" />
\t \t </plan>
\t \t <plan score="218.5148700010285" selected="no">
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" />
\t \t \t <leg mode="ride" dep_time="15:07:00" trav_time="00:03:26" arr_time="15:10:26">
\t \t \t \t <route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route>
\t \t \t </leg>
\t \t \t <act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" />
\t \t \t <leg mode="car" dep_time="16:21:00" trav_time="00:08:15" arr_time="16:29:15">
\t \t \t \t <route type="links">13904 13905 13906 13980 21187 21188 14148 14144 14130 14129</route>
\t \t \t </leg>
\t \t \t <act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" />
\t \t \t <leg mode="ride" dep_time="17:37:00" trav_time="00:11:18" arr_time="17:48:18">
\t \t \t \t <route type="links">14129 14130 14124 14110 14104 14077 14071 13975 13921 13871 13868 13884 13886 13888 13894 13904 13918 13924 13938 13976 14044 21259 21258</route>
\t \t \t </leg>
\t \t \t <act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" />
\t \t </plan>
\t </person>
<!-- ====================================================================== -->
\t <person id="10000302">
\t \t <plan score="209.66504470021556" selected="yes">
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" />
\t \t \t <leg mode="car" dep_time="07:56:00" trav_time="00:03:00" arr_time="07:59:00">
\t \t \t \t <route type="links">21256 13966 14056 14057</route>
\t \t \t </leg>
\t \t \t <act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" />
\t \t \t <leg mode="car" dep_time="10:28:00" trav_time="00:08:20" arr_time="10:36:20">
\t \t \t \t <route type="links">14057 14049 14045 13977 13939 13925 13919 21207 21208 13980 14046 14095 21191</route>
\t \t \t </leg>
\t \t \t <act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" />
\t \t \t <leg mode="car" dep_time="11:52:00" trav_time="00:08:33" arr_time="12:00:33">
\t \t \t \t <route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route>
\t \t \t </leg>
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" />
\t \t \t <leg mode="car" dep_time="12:11:00" trav_time="00:06:35" arr_time="12:17:35">
\t \t \t \t <route type="links">21256 21257 21258 14045 13977 13939 13925 13919 13905 13906</route>
\t \t \t </leg>
\t \t \t <act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" />
\t \t \t <leg mode="car" dep_time="13:30:00" trav_time="00:05:30" arr_time="13:35:30">
\t \t \t \t <route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route>
\t \t \t </leg>
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" />
\t \t </plan>
\t \t <plan score="205.5456839457717" selected="no">
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" />
\t \t \t <leg mode="car" dep_time="07:56:00" trav_time="00:02:15" arr_time="07:58:15">
\t \t \t \t <route type="links">21256 13966 14056 14057</route>
\t \t \t </leg>
\t \t \t <act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" />
\t \t \t <leg mode="car" dep_time="10:28:00" trav_time="00:06:51" arr_time="10:34:51">
\t \t \t \t <route type="links">14057 14056 14177 14191 14214 14247 14235 14213 14211 14198 14120 14114 21191</route>
\t \t \t </leg>
\t \t \t <act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" />
\t \t \t <leg mode="car" dep_time="11:52:00" trav_time="00:07:45" arr_time="11:59:45">
\t \t \t \t <route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route>
\t \t \t </leg>
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" />
\t \t \t <leg mode="car" dep_time="12:11:00" trav_time="00:07:51" arr_time="12:18:51">
\t \t \t \t <route type="links">21256 13915 13823 13767 13743 13731 13732 13837 13831 13819 13820 13854 13890 13906</route>
\t \t \t </leg>
\t \t \t <act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" />
\t \t \t <leg mode="car" dep_time="13:30:00" trav_time="00:08:54" arr_time="13:38:54">
\t \t \t \t <route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route>
\t \t \t </leg>
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" />
\t \t </plan>
\t \t <plan score="203.4205865037132" selected="no">
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" />
\t \t \t <leg mode="car" dep_time="07:56:00" trav_time="00:03:15" arr_time="07:59:15">
\t \t \t \t <route type="links">21256 13966 14056 14057</route>
\t \t \t </leg>
\t \t \t <act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" />
\t \t \t <leg mode="car" dep_time="10:28:00" trav_time="00:06:41" arr_time="10:34:41">
\t \t \t \t <route type="links">14057 14049 14045 13977 13939 13940 14050 14105 14114 21191</route>
\t \t \t </leg>
\t \t \t <act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" />
\t \t \t <leg mode="car" dep_time="11:52:00" trav_time="00:09:12" arr_time="12:01:12">
\t \t \t \t <route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route>
\t \t \t </leg>
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" />
\t \t \t <leg mode="car" dep_time="12:11:00" trav_time="00:05:10" arr_time="12:16:10">
\t \t \t \t <route type="links">21256 13966 14049 14045 13977 13939 13925 13919 13905 13906</route>
\t \t \t </leg>
\t \t \t <act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" />
\t \t \t <leg mode="car" dep_time="13:30:00" trav_time="00:05:30" arr_time="13:35:30">
\t \t \t \t <route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route>
\t \t \t </leg>
\t \t \t <act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" />
\t \t </plan>
\t </person>
<!-- ====================================================================== -->
</population>
我想根据这份 XML 数据生成具有以下结构的数据框。
person score selected act.typ act.x act.y act_start act_end leg.mod leg_dep leg_trav leg_arr
10000061 219.6258 yes home 334867.2 3126571 3:00:00 15:07:00 ride 15:07:00 0:03:27 15:10:27
10000061 219.6258 yes shop 332634.9 3127079 15:12:00 16:21:00 car 16:21:00 0:09:44 16:30:44
10000061 219.6258 yes shop 331666.4 3129306 16:25:00 17:37:00 ride 17:37:00 0:09:46 17:46:46
10000061 219.6258 yes home 334867.2 3126571 17:45:00 26:59:00 NA NA NA NA
10000302 209.665 yes home 334598.4 3126269 3:00:00 7:56:00 car 7:56:00 0:03:00 7:59:00
10000302 209.665 yes work 335957.1 3128105 8:04:00 10:28:00 car 10:28:00 0:08:20 10:36:20
10000302 209.665 yes social 333032.8 3128760 10:33:00 11:52:00 car 11:52:00 0:08:33 12:00:33
10000302 209.665 yes home 334598.4 3126269 11:59:00 12:11:00 car 12:11:00 0:06:35 12:17:35
10000302 209.665 yes social 332302.2 3127536 12:17:00 13:30:00 car 13:30:00 0:05:30 13:35:30
10000302 209.665 yes home 334598.4 3126269 13:36:00 26:59:00 NA NA NA NA
我已经研究过 R 的 XML 包及其各种函数,如 `xpathApply` 和 `xmlGetAttr`,并且能够在每个节点层级分别提取出列表和数据框。但是,正如您所看到的,我需要在保持父子关系的前提下,把这些不同的数据子集合并成一个数据框。此外,我只想选取 `selected` 属性值为 "yes" 的记录所对应的父节点属性和子节点属性。我也不需要最末层节点(即 `route`)的值。这里只粘贴了两个人的信息,实际数据包含 300 万人的信息。