From f19a994145490c6132b9e258b35b177fd7a58ac6 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Tue, 9 Jun 2026 13:37:56 +0200 Subject: [PATCH] Speedup gem loading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit github-linguist is pretty slow to load, in large part because it has to parse a pretty large json blob (~35ms on my machine). Instead of shipping this data as JSON, we could ship it directly as Ruby code. By default, the Ruby parser performs about as well as JSON & Yajl: ``` ruby 4.0.5 (2026-05-20 revision 64336ffd0e) +YJIT +PRISM [arm64-darwin25] Calculating ------------------------------------- json 27.363 (± 3.7%) i/s (36.55 ms/i) - 138.000 in 5.043304s yajl 28.111 (± 3.6%) i/s (35.57 ms/i) - 142.000 in 5.051466s load 27.719 (± 3.6%) i/s (36.08 ms/i) - 140.000 in 5.050659s Comparison: json: 27.4 i/s yajl: 28.1 i/s - same-ish: difference falls within error load: 27.7 i/s - same-ish: difference falls within error ``` But with Bootsnap, which can probably be assumed for many users, it's over 4 times faster to load the data: ``` ruby 4.0.5 (2026-05-20 revision 64336ffd0e) +YJIT +PRISM [arm64-darwin25] Calculating ------------------------------------- json 28.350 (± 3.5%) i/s (35.27 ms/i) - 142.000 in 5.008777s yajl 30.316 (± 3.3%) i/s (32.99 ms/i) - 152.000 in 5.013793s load+bootsnap 128.326 (± 4.7%) i/s (7.79 ms/i) - 650.000 in 5.065217s Comparison: json: 28.4 i/s load+bootsnap: 128.3 i/s - 4.53x faster yajl: 30.3 i/s - same-ish: difference falls within error ``` This approach could even be taken further by directly generating Ruby code that calls `Language.create` with the relevant arguments, but I decided to scope this change to the smallest possible one so as to test the waters. 
Benchmark: ```ruby require 'bundler/inline' gemfile do source 'https://rubygems.org' gem 'yajl-ruby' gem 'json' gem 'benchmark-ips' gem 'bootsnap' end require 'json' require 'yajl' require 'benchmark/ips' if ENV["BOOTSNAP_CACHE_DIR"] require 'bootsnap/setup' end Benchmark.ips do |x| x.report('json') { JSON.parse(File.read("lib/linguist/samples.json"))} x.report('yajl') { Yajl.load(File.read("lib/linguist/samples.json"))} x.report(ENV["BOOTSNAP_CACHE_DIR"] ? 'load+bootsnap' : 'load') { mod = Module.new; load("lib/linguist/samples_data.rb", mod); mod::DATA } x.compare!(order: :baseline) end ``` --- .gitignore | 2 +- Rakefile | 12 +++++------- lib/linguist/language.rb | 14 +++++--------- lib/linguist/samples.rb | 14 ++++---------- 4 files changed, 15 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index 2ff2113c33..f859d3e98e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ .bundle/ .idea benchmark/ -lib/linguist/samples.json +lib/linguist/samples_data.rb /grammars /node_modules test/fixtures/ace_modes.json diff --git a/Rakefile b/Rakefile index eab66d31c8..fbb857c501 100644 --- a/Rakefile +++ b/Rakefile @@ -3,9 +3,7 @@ require 'rake/clean' require 'rake/testtask' require 'rake/extensiontask' require 'yaml' -require 'yajl' require 'open-uri' -require 'json' require 'open3' task :default => :test @@ -23,7 +21,7 @@ task :test => [:compile, :check_samples, :fetch_ace_modes] desc "Check that we have samples.json generated" task :check_samples do - unless File.exist?('lib/linguist/samples.json') + unless File.exist?('lib/linguist/samples_data.rb') Rake::Task[:samples].invoke end end @@ -45,8 +43,8 @@ end task :samples => :compile do require 'linguist/samples' - json = Yajl.dump(Linguist::Samples.data, :pretty => false) - File.write 'lib/linguist/samples.json', json + require 'pp' + File.write 'lib/linguist/samples_data.rb', "#frozen_string_literal: true\nDATA = #{PP.pp(Linguist::Samples.data, +'')}" end task :flex do @@ -94,9 +92,9 @@ task 
:build_gem => :samples do rm_rf "grammars" sh "script/grammar-compiler compile -o grammars || true" languages = YAML.load_file("lib/linguist/languages.yml") - File.write("lib/linguist/languages.json", Yajl.dump(languages)) + File.write("lib/linguist/languages_data.rb", "#frozen_string_literal: true\nDATA = #{PP.pp(languages, +'')}") `gem build github-linguist.gemspec` - File.delete("lib/linguist/languages.json") + File.delete("lib/linguist/languages_data.rb") end namespace :benchmark do diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 5de99ba551..34976cfbba 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -1,10 +1,5 @@ require 'cgi' require 'yaml' -begin - require 'yajl' -rescue LoadError - require 'json' -end require 'linguist/classifier' require 'linguist/heuristics' @@ -501,11 +496,12 @@ def inspect popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__)) languages_yml = File.expand_path("../languages.yml", __FILE__) - languages_json = File.expand_path("../languages.json", __FILE__) + languages_rb = File.expand_path("../languages_data.rb", __FILE__) - if File.exist?(languages_json) - serializer = defined?(Yajl) ? 
Yajl : JSON - languages = serializer.load(File.read(languages_json)) + if File.exist?(languages_rb) + mod = Module.new + load(languages_rb, mod) + languages = mod::DATA else languages = YAML.load_file(languages_yml) end diff --git a/lib/linguist/samples.rb b/lib/linguist/samples.rb index ae9bf296af..80917cab9e 100644 --- a/lib/linguist/samples.rb +++ b/lib/linguist/samples.rb @@ -15,7 +15,7 @@ module Samples ROOT = File.expand_path("../../../samples", __FILE__) # Path for serialized samples db - PATH = File.expand_path('../samples.json', __FILE__) + PATH = File.expand_path('../samples_data.rb', __FILE__) # Hash of serialized samples object, cached in memory def self.cache @@ -24,15 +24,9 @@ def self.cache # Hash of serialized samples object, uncached def self.load_samples - serializer = defined?(Yajl) ? Yajl : JSON - data = serializer.load(File.read(PATH, encoding: 'utf-8')) - # JSON serialization does not allow integer keys, we fix them here - for lang in data['centroids'].keys - fixed = data['centroids'][lang].to_a.map { |k,v| [k.to_i, v] } - data['centroids'][lang] = Hash[fixed] - end - - data + mod = Module.new + load(PATH, mod) + mod::DATA end # Public: Iterate over each sample.