From 5e4b8b83679a01a15c2c7e47c5c48450f9756fbd Mon Sep 17 00:00:00 2001 From: Shinji Kuwayama Date: Mon, 2 May 2011 18:41:59 -0500 Subject: [PATCH] Enabling an S3 storage option, for use on read-only filesystems like Heroku. --- .DS_Store | Bin 0 -> 6148 bytes README.md | 166 +++++++++++++++++++++++++++-- lib/sitemap_generator/link_set.rb | 8 +- tasks/sitemap_generator_tasks.rake | 38 ++++++- templates/sitemap.rb | 5 + test/sitemap_generator_test.rb | 13 ++- 6 files changed, 212 insertions(+), 18 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 'zipcode', :action => 'index' + +Don't tell me it's trivial, because it isn't. It just looks trivial. + +So my idea is to have another file similar to 'routes.rb' called 'sitemap.rb', where you can define what goes into the Sitemap. + +Here's my solution: + + Zipcode.find(:all, :include => :city).each do |z| + sitemap.add zipcode_path(:state => z.city.state, :city => z.city, :zipcode => z) + end + +Easy hey? + +Other Sitemap settings for the link, like `lastmod`, `priority`, `changefreq` and `host` are entered automatically, although you can override them if you need to. + +Other "difficult" Sitemap issues, solved by this plugin: + +- Support for more than 50,000 urls (using a Sitemap Index file) +- Gzip of Sitemap files +- Variable priority of links +- Paging/sorting links (e.g. my_list?page=3) +- SSL host links (e.g. https:) +- Rails apps which are installed on a sub-path (e.g. example.com/blog_app/) + +Installation +======= + +**As a gem** + +1. Add the gem as a dependency in your config/environment.rb + + config.gem 'sitemap_generator', :lib => false, :source => 'http://gemcutter.org' + +2. 
`$ rake gems:install` + +3. Add the following line to your RAILS_ROOT/Rakefile + + require 'sitemap_generator/tasks' rescue LoadError + +4. `$ rake sitemap:install` + +**As a plugin** + +1. Install plugin as normal + + $ ./script/plugin install git://github.com/adamsalter/sitemap_generator.git + +---- + +Installation should create a 'config/sitemap.rb' file which will contain your logic for generation of the Sitemap files. (If you want to recreate this file manually run `rake sitemap:install`) + +You can run `rake sitemap:refresh` as needed to create Sitemap files. This will also ping all the ['major'][sitemap_engines] search engines. (if you want to disable all non-essential output run the rake task thusly `rake -s sitemap:refresh`) + +Sitemaps with many urls (100,000+) take quite a long time to generate, so if you need to refresh your Sitemaps regularly you can set the rake task up as a cron job. Most cron agents will only send you an email if there is output from the cron task. + +Optionally, you can add the following to your robots.txt file, so that robots can find the sitemap file. + + Sitemap: /sitemap_index.xml.gz + +The robots.txt Sitemap URL should be the complete URL to the Sitemap Index, such as: `http://www.example.org/sitemap_index.xml.gz` + + +Example 'config/sitemap.rb' +========== + + # Set the host name for URL creation + SitemapGenerator::Sitemap.default_host = "http://www.example.com" + + SitemapGenerator::Sitemap.add_links do |sitemap| + # Put links creation logic here. + # + # The Root Path ('/') and Sitemap Index file are added automatically. + # Links are added to the Sitemap output in the order they are specified. 
+ # + # Usage: sitemap.add path, options + # (default options are used if you don't specify them) + # + # Defaults: :priority => 0.5, :changefreq => 'weekly', + # :lastmod => Time.now, :host => default_host + + + # Examples: + + # add '/articles' + sitemap.add articles_path, :priority => 0.7, :changefreq => 'daily' + + # add all individual articles + Article.find(:all).each do |a| + sitemap.add article_path(a), :lastmod => a.updated_at + end + + # add merchant path + sitemap.add '/purchase', :priority => 0.7, :host => "https://www.example.com" + + end + +Notes +======= + +1) Tested/working on Rails 1.x.x <=> 2.x.x, no guarantees made for Rails 3.0. + +2) For large sitemaps it may be useful to split your generation into batches to avoid running out of memory. E.g.: + + # add movies + Movie.find_in_batches(:batch_size => 1000) do |movies| + movies.each do |movie| + sitemap.add "/movies/show/#{movie.to_param}", :lastmod => movie.updated_at, :changefreq => 'weekly' + end + end + +3) New Capistrano deploys will remove your Sitemap files, unless you run `rake sitemap:refresh`. The way around this is to create a cap task: + + after "deploy:update_code", "deploy:copy_old_sitemap" + + namespace :deploy do + task :copy_old_sitemap do + run "if [ -e #{previous_release}/public/sitemap_index.xml.gz ]; then cp #{previous_release}/public/sitemap* #{current_release}/public/; fi" + end + end + +Known Bugs +======== + +- Sitemaps.org [states][sitemaps_org] that no Sitemap XML file should be more than 10Mb uncompressed. The plugin will warn you about this, but does nothing to avoid it (like move some URLs into a later file). +- There's no check on the size of a URL which [isn't supposed to exceed 2,048 bytes][sitemaps_xml]. +- Currently only supports one Sitemap Index file, which can contain 50,000 Sitemap files which can each contain 50,000 urls, so it _only_ supports up to 2,500,000,000 (2.5 billion) urls. 
I personally have no need of support for more urls, but plugin could be improved to support this. + +Thanks (in no particular order) +======== + +- [Karl Varga (aka Bear Grylls)](http://github.com/kjvarga) +- [Dan Pickett](http://github.com/dpickett) +- [Rob Biedenharn](http://github.com/rab) +- [Richie Vos](http://github.com/jerryvos) + -[Karl Varga](http://github.com/kjvarga) has taken over development of SitemapGenerator. +Follow me on: +--------- -The canonical repository is [http://github.com/kjvarga/sitemap_generator][canonical_repo]. +> Twitter: [twitter.com/adamsalter](http://twitter.com/adamsalter) +> Github: [github.com/adamsalter](http://github.com/adamsalter) -Issues should be logged at [http://github.com/kjvarga/sitemap_generator/issues][issues_url]. +Copyright (c) 2009 Adam @ [Codebright.net][cb], released under the MIT license -[canonical_repo]:http://github.com/kjvarga/sitemap_generator -[issues_url]:http://github.com/kjvarga/sitemap_generator/issues \ No newline at end of file +[enterprise_class]:https://twitter.com/dhh/status/1631034662 "I use enterprise in the same sense the Phusion guys do - i.e. Enterprise Ruby. Please don't look down on my use of the word 'enterprise' to represent being a cut above. It doesn't mean you ever have to work for a company the size of IBM. Or constantly fight inertia, writing crappy software, adhering to change management practices and spending hours in meetings... Not that there's anything wrong with that - Wait, what?" 
+[sitemap_engines]:http://en.wikipedia.org/wiki/Sitemap_index "http://en.wikipedia.org/wiki/Sitemap_index" +[sitemaps_org]:http://www.sitemaps.org/protocol.php "http://www.sitemaps.org/protocol.php" +[sitemaps_xml]:http://www.sitemaps.org/protocol.php#xmlTagDefinitions "XML Tag Definitions" +[sitemap_generator_usage]:http://wiki.github.com/adamsalter/sitemap_generator/sitemapgenerator-usage "http://wiki.github.com/adamsalter/sitemap_generator/sitemapgenerator-usage" +[boost_juice]:http://www.boostjuice.com.au/ "Mmmm, sweet, sweet Boost Juice." +[cb]:http://codebright.net "http://codebright.net" diff --git a/lib/sitemap_generator/link_set.rb b/lib/sitemap_generator/link_set.rb index 4e76e019..1078cb2e 100644 --- a/lib/sitemap_generator/link_set.rb +++ b/lib/sitemap_generator/link_set.rb @@ -1,6 +1,8 @@ module SitemapGenerator + class LinkSet - attr_accessor :default_host, :yahoo_app_id, :links + + attr_accessor :default_host, :yahoo_app_id, :links, :s3_access_key_id, :s3_secret_access_key, :s3_bucket_name def initialize @links = [] @@ -24,5 +26,7 @@ def add_links def add_link(link) @links << link end + end -end \ No newline at end of file + +end diff --git a/tasks/sitemap_generator_tasks.rake b/tasks/sitemap_generator_tasks.rake index cb6dfac7..20fa667e 100644 --- a/tasks/sitemap_generator_tasks.rake +++ b/tasks/sitemap_generator_tasks.rake @@ -1,4 +1,11 @@ require 'zlib' +begin + require 'aws/s3' + include AWS::S3 +rescue LoadError + # aws-s3 is optional: it is only needed when S3 credentials are configured in config/sitemap.rb +end + namespace :sitemap do @@ -7,9 +14,11 @@ namespace :sitemap do load File.expand_path(File.join(File.dirname(__FILE__), "../rails/install.rb")) end - desc "Delete all Sitemap files in public/ directory" + desc "Delete all Sitemap files in public/ and tmp/ directories" task :clean do - sitemap_files = Dir[File.join(RAILS_ROOT, 'public/sitemap*.xml.gz')] + sitemap_files = Dir[File.join(RAILS_ROOT, "/public/sitemap*.xml.gz")] + FileUtils.rm sitemap_files +
sitemap_files = Dir[File.join(RAILS_ROOT, "/tmp/sitemap*.xml.gz")] FileUtils.rm sitemap_files end @@ -20,7 +29,7 @@ namespace :sitemap do end desc "Create Sitemap XML files (don't ping search engines)" - task 'refresh:no_ping' => ['sitemap:create'] do + task 'refresh:no_ping' => ['sitemap:create'] do end task :create => [:environment] do @@ -39,33 +48,52 @@ namespace :sitemap do Rake::Task['sitemap:clean'].invoke + s3_enabled = (!SitemapGenerator::Sitemap.s3_access_key_id.blank? && !SitemapGenerator::Sitemap.s3_secret_access_key.blank? && !SitemapGenerator::Sitemap.s3_bucket_name.blank?) + local_storage = (s3_enabled ? 'tmp' : 'public') + if s3_enabled + AWS::S3::Base.establish_connection!( + :access_key_id => SitemapGenerator::Sitemap.s3_access_key_id, + :secret_access_key => SitemapGenerator::Sitemap.s3_secret_access_key + ) + end + # render individual sitemaps sitemap_files = [] links_grps.each_with_index do |links, index| buffer = '' xml = Builder::XmlMarkup.new(:target=>buffer) eval(open(SitemapGenerator.templates[:sitemap_xml]).read, binding) - filename = File.join(RAILS_ROOT, "public/sitemap#{index+1}.xml.gz") + filename = File.join(RAILS_ROOT, "#{local_storage}/sitemap#{index+1}.xml.gz") Zlib::GzipWriter.open(filename) do |gz| gz.write buffer end puts "+ #{filename}" if verbose puts "** Sitemap too big! 
The uncompressed size exceeds 10Mb" if (buffer.size > 10 * 1024 * 1024) && verbose sitemap_files << filename + if s3_enabled + File.open(filename, 'rb') { |file| AWS::S3::S3Object.store(File.basename(filename), file, SitemapGenerator::Sitemap.s3_bucket_name, :access => :public_read) } + puts " [uploaded to S3:#{SitemapGenerator::Sitemap.s3_bucket_name}]" if verbose + end end # render index buffer = '' xml = Builder::XmlMarkup.new(:target=>buffer) eval(open(SitemapGenerator.templates[:sitemap_index]).read, binding) - filename = File.join(RAILS_ROOT, "public/sitemap_index.xml.gz") + filename = File.join(RAILS_ROOT, "#{local_storage}/sitemap_index.xml.gz") Zlib::GzipWriter.open(filename) do |gz| gz.write buffer end puts "+ #{filename}" if verbose puts "** Sitemap Index too big! The uncompressed size exceeds 10Mb" if (buffer.size > 10 * 1024 * 1024) && verbose + if s3_enabled + File.open(filename, 'rb') { |file| AWS::S3::S3Object.store(File.basename(filename), file, SitemapGenerator::Sitemap.s3_bucket_name, :access => :public_read) } + puts " [uploaded to S3:#{SitemapGenerator::Sitemap.s3_bucket_name}]" if verbose + end + stop_time = Time.now puts "Sitemap stats: #{number_with_delimiter(SitemapGenerator::Sitemap.links.length)} links, " + ("%dm%02ds" % (stop_time - start_time).divmod(60)) if verbose end + end diff --git a/templates/sitemap.rb b/templates/sitemap.rb index 9210a159..8bca6c7b 100644 --- a/templates/sitemap.rb +++ b/templates/sitemap.rb @@ -1,3 +1,8 @@ +# Optional: Set Amazon S3 credentials. If omitted, sitemaps will go in /public.
+SitemapGenerator::Sitemap.s3_access_key_id = "" +SitemapGenerator::Sitemap.s3_secret_access_key = "" +SitemapGenerator::Sitemap.s3_bucket_name = "" + # Set the host name for URL creation SitemapGenerator::Sitemap.default_host = "http://www.example.com" diff --git a/test/sitemap_generator_test.rb b/test/sitemap_generator_test.rb index c614cca5..c2af788e 100644 --- a/test/sitemap_generator_test.rb +++ b/test/sitemap_generator_test.rb @@ -6,12 +6,16 @@ class SitemapGeneratorTest < Test::Unit::TestCase context "when running the clean task" do setup do copy_sitemap_file_to_rails_app - FileUtils.touch(File.join(RAILS_ROOT, '/public/sitemap_index.xml.gz')) - Rake::Task['sitemap:clean'].invoke + ['public','tmp'].each do |dir| + FileUtils.touch(File.join(RAILS_ROOT, "/#{dir}/sitemap_index.xml.gz")) + end + Rake::Task['sitemap:clean'].invoke end should "the sitemap xml files be deleted" do - assert !File.exists?(File.join(RAILS_ROOT, '/public/sitemap_index.xml.gz')) + ['public','tmp'].each do |dir| + assert !File.exists?(File.join(RAILS_ROOT, "/#{dir}/sitemap_index.xml.gz")) + end end end @@ -49,10 +53,11 @@ class SitemapGeneratorTest < Test::Unit::TestCase Rake::Task['sitemap:refresh'].invoke end - should "not create sitemap xml files" do + should "create sitemap xml files" do assert File.exists?(File.join(RAILS_ROOT, '/public/sitemap_index.xml.gz')) assert File.exists?(File.join(RAILS_ROOT, '/public/sitemap1.xml.gz')) end + end end