
Answers


I found a very useful command-line tool, written in Node, that does exactly this, and its source code was very helpful for this task. Here is a link to the package's repository: https://github.com/lgraubner/node-sitemap-generator-cli

Here is the code I ended up using:

var Crawler = require('simplecrawler'); 

var port = 80; 
var exclude = ['gif', 'jpg', 'jpeg', 'png', 'ico', 'bmp', 'ogg', 'webp', 
    'mp4', 'webm', 'mp3', 'ttf', 'woff', 'json', 'rss', 'atom', 'gz', 'zip', 
    'rar', '7z', 'css', 'js', 'gzip', 'exe']; 
var exts = exclude.join('|'); 
var regex = new RegExp('\\.(' + exts + ')', 'i'); // Matches URLs whose path contains one of the excluded file extensions. 
var crawler = new Crawler('www.website.com'); 

var pages = []; // This array will hold all the URLs 

// Crawler configuration 
crawler.initialPort = port; 
crawler.initialPath = '/'; 

crawler.addFetchCondition(function (parsedURL) { 
    return !parsedURL.path.match(regex); // Skip URLs that point to static assets rather than pages. 
}); 

// Collect each successfully fetched URL 
crawler.on('fetchcomplete', function(item, responseBuffer, response) { 
    pages.push(item.url); // Add URL to the array of pages 
}); 

// Run the crawler 
crawler.start(); 
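
Note that the snippet above only collects URLs into the pages array; nothing writes the sitemap itself. A minimal sketch of that missing step, using simplecrawler's complete event and Node's built-in fs module (the sitemap.xml filename and the bare urlset wrapper here are just examples):

var fs = require('fs'); 

crawler.on('complete', function() { 
    // Wrap each collected URL in a <url>/<loc> entry and write a basic sitemap. 
    var xml = '<?xml version="1.0" encoding="UTF-8"?>\n' + 
     '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n' + 
     pages.map(function(page) { 
      return '<url><loc>' + page + '</loc></url>'; 
     }).join('\n') + 
     '\n</urlset>'; 
    fs.writeFile('sitemap.xml', xml, function(err) { 
     if (err) throw err; 
     console.log('Sitemap written with ' + pages.length + ' URLs'); 
    }); 
}); 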

I'm not sure about that, but I wasn't happy with the existing tools, so I used the crawler package to write some code that builds the sitemap fully automatically:

var Crawler = require("crawler"); 
var url = require('url'); 
var fs = require('fs'); 

var writeStream = fs.createWriteStream('./output'); 
writeStream.write('<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation=" http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">'); 

var Router = require('routes'); 
var router = Router(); 
var noop = function(){}; 

// Change frequency for each known route pattern 
var period = { 
    '/': 'hourly', 
    '/results': 'hourly', 
    '/tips': 'hourly', 
    '/tips/:country': 'hourly', 
    '/tips/:country/:venue': 'hourly', 
    '/support': 'hourly' 
}; 


function addToMap(url) { 
    var key = router.match(url.replace('https://www.yourwebsite.com', '')); 
    if(!key) { 
     key = {}; 
     key.route = '/'; 
    } else { 
     console.log('match ', url); 
    } 
    var route = key.route; 
    // Fall back to a default frequency for routes missing from the period map (e.g. /algorithm) 
    var freq = period[route] || 'daily'; 
    var buf = '<url>\n<loc>' + url + '</loc>\n<changefreq>' + freq + '</changefreq>\n<priority>0.5</priority>\n</url>'; 

    writeStream.write(buf); 
} 

function saveTofile() { 
    console.log('end'); 
    writeStream.write('\n</urlset>'); 
    writeStream.end(); 
} 

router.addRoute("/", noop); 
router.addRoute("/tips", noop); 
router.addRoute("/tips/:country", noop); 
router.addRoute("/tips/:country/:venue", noop); 
router.addRoute("/support", noop); 
router.addRoute("/algorithm", noop); 

var cache = {}; 

var c = new Crawler({ 
    maxConnections : 25, 
    skipDuplicates: true, 
    // Called when the queue is drained (newer versions of the crawler package 
    // expose this as the 'drain' event instead of an option) 
    onDrain: function() { 
     console.log('ondrain'); 
     saveTofile(); 
    }, 
    // This will be called for each crawled page 
    callback : function (error, result, $) { 
     if(error || !$) { 
      console.log(error, result && result.uri); 
      return; 
     } 
     $('a').each(function(index, a) { 
      var toQueueUrl = $(a).attr('href'); 
      if(!toQueueUrl) { 
       return; 
      } 

      // Skip external/absolute links, API endpoints and PDFs 
      if((toQueueUrl && toQueueUrl[0] !== '/') || toQueueUrl.indexOf('/api/') !== -1 || toQueueUrl.indexOf('.pdf') !== -1) { 
       return; 
      } 
      if(cache.hasOwnProperty(toQueueUrl) || !toQueueUrl) { 
       return; 
      } 
      //console.log(toQueueUrl); 
      c.queue('https://www.yourwebsite.com'+toQueueUrl); 

      addToMap('https://www.yourwebsite.com'+toQueueUrl); 

      cache[toQueueUrl] = 1; 

      var keyz = Object.keys(cache); 
      if(! (keyz.length % 100)) { 
       console.log('total', keyz.length); 
      } 
     }); 
    } 
}); 


c.queue('https://www.yourwebsite.com'); 

Hope it helps you.