我正在对 MongoDB 中的一个大集合执行查询。这个查询实际上由两部分组成,总共需要大约 900ms 才能执行完,而我需要它快得多。(标题:MongoDB 在大数据量下查询缓慢)
以下是涉及的集合(collection)。stoptimes:
> db.stoptimes.find().limit(1);
{
"trip_id": "24893A459B661",
"arrival_time": "22:30:00",
"departure_time": "22:30:00",
"stop_id": "1904",
"stop_sequence": 2,
"stop_headsign": "",
"pickup_type": "0",
"drop_off_type": "0",
"shape_dist_traveled": "0.88659123054",
"agency_key": "alamedaoakland-ferry",
"_id": ObjectId("52b394c680052ea30918fd62")
}
> db.stoptimes.count();
5959551
以及 trips:
> db.trips.find().limit(1);
{
"route_id": "60",
"service_id": "180A536",
"trip_id": "23736A180B536",
"trip_short_name": "",
"trip_headsign": "San Francisco via Pier 41",
"direction_id": "",
"block_id": "282",
"shape_id": "30",
"trip_bikes_allowed": "2",
"agency_key": "alamedaoakland-ferry",
"_id": ObjectId("52b394c780052ea30918ff34")
}
> db.trips.count();
204884
我想在 trips 集合中找出所有不同的 route_id,条件是该文档的 trip_id 等于 stoptimes 集合中与给定 stop_id 匹配的某个 trip_id。
------ stoptimes --- -> ---------- trips -----------------
stop_id1 -> trip_id1 -> trip_id1 -> route_id1 -> route_id1
-> trip_id2 -> trip_id2 -> route_id2 -> route_id2
-> trip_id3 -> trip_id3 -> route_id2
-> trip_id4 -> trip_id4 -> route_id2
-> trip_id5 -> trip_id5 -> route_id3 -> route_id3
这是在 MongoDB shell 中执行的查询:
> var tripids = db.stoptimes.aggregate([
... {$match : { 'stop_id' : '1904' }},
... {$project : { '_id' : 0, 'trip_id' : 1 }}
... ]);
> var arr = [];
> for(var i=0; i<tripids.result.length; i++)
... { arr.push(tripids.result[i].trip_id); }
> db.trips.aggregate([
... {$match : { 'trip_id' : {$in : arr}}},
... {$group : {
... _id : "$route_id",
... direction_id : { $first: '$direction_id'},
... shape_id : {$first : '$shape_id'}}}
... ])
这是我实际使用的代码片段。请注意,这是 Node.js + Mongoose 的 JavaScript,但它应该很容易阅读,因为它只是普通的 JavaScript:
// Finds the distinct routes that serve `stop_id`, using two aggregations:
//   1. StopTime: collect the trip_id of every stop time matching the stop.
//   2. Trip: select the trips with those ids and group them by route_id,
//      keeping the first direction_id and shape_id seen for each route.
// The outcome is delivered through `cb(err, route_ids)`.
StopTime.aggregate([
  {$match : {
    'stop_id' : stop_id
  }},
  {$project : {
    '_id' : 0,
    'trip_id' : 1
  }}
], function (err, trip_ids) {
  // Report a failed first query to the caller. Without this guard the code
  // would go on to read trip_ids.length, which throws when no result was
  // passed (as is usual for Node-style callbacks on error), and the original
  // error would never reach cb.
  if (err) {
    return cb(err);
  }

  // Flatten the projected documents into a plain list of ids for $in.
  var arr = trip_ids.map(function (doc) {
    return doc.trip_id;
  });

  Trip.aggregate([
    {$match : {
      'trip_id' : {$in : arr}
    }},
    {$group : {
      _id : "$route_id",
      direction_id : { $first: '$direction_id'},
      shape_id : { $first: '$shape_id'}
    }}
  ], function (err, route_ids) {
    cb(err, route_ids);
  });
});
我该怎么做才能提高性能?
编辑:
耗时这么长的只有下面这一个查询:
> db.trips.aggregate([
... {$match : { 'trip_id' : {$in : arr}}},
... {$group : {
... _id : "$route_id",
... direction_id : { $first: '$direction_id'},
... shape_id : {$first : '$shape_id'}}}
... ])