Skip to content

Commit

Permalink
Merge pull request #763 from jneen/refactor.disambiguators
Browse files Browse the repository at this point in the history
Refactor: Use manual disambiguators for shared filenames
  • Loading branch information
gfx authored Sep 13, 2017
2 parents 95ab4ae + 75c5071 commit 5be8d6e
Show file tree
Hide file tree
Showing 72 changed files with 223 additions and 280 deletions.
16 changes: 9 additions & 7 deletions lib/rouge.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,6 @@ def highlight(text, lexer, formatter, &b)
load load_dir.join('rouge/text_analyzer.rb')
load load_dir.join('rouge/token.rb')

load load_dir.join('rouge/guesser.rb')
load load_dir.join('rouge/guessers/glob_mapping.rb')
load load_dir.join('rouge/guessers/modeline.rb')
load load_dir.join('rouge/guessers/filename.rb')
load load_dir.join('rouge/guessers/mimetype.rb')
load load_dir.join('rouge/guessers/source.rb')

load load_dir.join('rouge/lexer.rb')
load load_dir.join('rouge/regex_lexer.rb')
load load_dir.join('rouge/template_lexer.rb')
Expand All @@ -57,6 +50,15 @@ def highlight(text, lexer, formatter, &b)
Rouge::Lexers.load_lexer(Pathname.new(f).relative_path_from(lexers_dir).to_s)
end

load load_dir.join('rouge/guesser.rb')
load load_dir.join('rouge/guessers/util.rb')
load load_dir.join('rouge/guessers/glob_mapping.rb')
load load_dir.join('rouge/guessers/modeline.rb')
load load_dir.join('rouge/guessers/filename.rb')
load load_dir.join('rouge/guessers/mimetype.rb')
load load_dir.join('rouge/guessers/source.rb')
load load_dir.join('rouge/guessers/disambiguation.rb')

load load_dir.join('rouge/formatter.rb')
load load_dir.join('rouge/formatters/html.rb')
load load_dir.join('rouge/formatters/html_table.rb')
Expand Down
88 changes: 88 additions & 0 deletions lib/rouge/guessers/disambiguation.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
module Rouge
  module Guessers
    # Guesser that chooses between lexers claiming the same filename glob by
    # running hand-written rules against the source text.
    class Disambiguation < Guesser
      include Util
      include Lexers

      # @param filename [String] path of the file; only its basename is kept
      # @param source [String, #read] the text the rules will inspect
      def initialize(filename, source)
        @source = source
        @filename = File.basename(filename)
      end

      # Narrows the candidate list using the registered rules.
      #
      # The list is handed back untouched when it holds exactly one lexer or
      # when it is as long as `Lexer.all`. Otherwise the rules are tried in
      # registration order and the first truthy decision from a rule whose
      # glob matches the filename is returned. When no rule decides, the
      # original list is returned.
      #
      # @param lexers [Array] candidate lexers
      # @return [Array] the decided lexers, or `lexers` itself
      def filter(lexers)
        return lexers if lexers.size == 1 || lexers.size == Lexer.all.size

        # The rule blocks query this through #contains? and #matches?.
        @analyzer = TextAnalyzer.new(get_source(@source))

        self.class.disambiguators.each do |rule|
          if rule.match?(@filename)
            decision = rule.decide!(self)
            return decision if decision
          end
        end

        lexers
      end

      # @param text [String] literal text to look for
      # @return whatever `include?` on the analyzer returns for `text`
      def contains?(text)
        @analyzer.include?(text)
      end

      # @param re [Regexp] pattern tested against the analyzed text
      # @return [Boolean] true when the pattern matches somewhere
      def matches?(re)
        (@analyzer =~ re) ? true : false
      end

      # Rule registry; must exist before the `disambiguate` calls below run.
      @disambiguators = []

      class << self
        # @return [Array<Disambiguator>] rules in registration order
        attr_reader :disambiguators

        # Registers a rule for filenames matching any of `patterns`.
        # The block is later evaluated in the context of a guesser instance.
        def disambiguate(*patterns, &decider)
          @disambiguators << Disambiguator.new(patterns, &decider)
        end
      end

      # A set of filename globs paired with the block that makes the decision.
      class Disambiguator
        include Util

        def initialize(patterns, &decider)
          @decider = decider
          @patterns = patterns
        end

        # Evaluates the decider block with `guesser` as self.
        #
        # @return [Array, nil] an Array result as-is, nil when the block
        #   yields nil, and any other value wrapped in a one-element Array
        def decide!(guesser)
          verdict = guesser.instance_eval(&@decider)

          if verdict.is_a?(Array) || verdict.nil?
            verdict
          else
            [verdict]
          end
        end

        # @return [Boolean] true when any registered glob matches `filename`
        def match?(filename)
          @patterns.any? do |glob|
            test_glob(glob, filename)
          end
        end
      end

      disambiguate '*.pl' do
        if contains?('my $')
          Perl
        elsif contains?(':-') || matches?(/\A\w+(\(\w+\,\s*\w+\))*\./)
          Prolog
        end
      end

      disambiguate '*.h' do
        if matches?(/@(end|implementation|protocol|property)\b/) || contains?('@"')
          ObjectiveC
        else
          # C is the fallback when no Objective-C marker is present.
          C
        end
      end

      disambiguate '*.m' do
        if matches?(/@(end|implementation|protocol|property)\b/) || contains?('@"')
          ObjectiveC
        elsif matches?(/^\s*?%/)
          Matlab
        end
      end
    end
  end
end
9 changes: 3 additions & 6 deletions lib/rouge/guessers/glob_mapping.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ module Guessers
# This class allows for custom behavior
# with glob -> lexer name mappings
class GlobMapping < Guesser
include Util

def self.by_pairs(mapping, filename)
glob_map = {}
mapping.each do |(glob, lexer_name)|
Expand All @@ -29,18 +31,13 @@ def filter(lexers)

collect_best(lexers) do |lexer|
score = (@glob_map[lexer.name] || []).map do |pattern|
if test_pattern(pattern, basename)
if test_glob(pattern, basename)
# specificity is better the fewer wildcards there are
-pattern.scan(/[*?\[]/).size
end
end.compact.min
end
end

private
def test_pattern(pattern, path)
File.fnmatch?(pattern, path, File::FNM_DOTMATCH | File::FNM_CASEFOLD)
end
end
end
end
22 changes: 6 additions & 16 deletions lib/rouge/guessers/source.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
module Rouge
module Guessers
class Source < Guesser
include Util

attr_reader :source
def initialize(source)
@source = source
Expand All @@ -11,27 +13,15 @@ def filter(lexers)
# we've already filtered to 1
return lexers if lexers.size == 1

# If we're filtering against *all* lexers, we only use confident return
# values from analyze_text. But if we've filtered down already, we can trust
# the analysis more.
threshold = lexers.size < 10 ? 0 : 0.5

source_text = case @source
when String
@source
when ->(s){ s.respond_to? :read }
@source.read
else
raise 'invalid source'
end
source_text = get_source(@source)

Lexer.assert_utf8!(source_text)

source_text = TextAnalyzer.new(source_text)

collect_best(lexers, threshold: threshold) do |lexer|
next unless lexer.methods(false).include? :analyze_text
lexer.analyze_text(source_text)
collect_best(lexers) do |lexer|
next unless lexer.methods(false).include? :detect?
lexer.detect?(source_text) ? 1 : nil
end
end
end
Expand Down
20 changes: 20 additions & 0 deletions lib/rouge/guessers/util.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
module Rouge
  module Guessers
    # Helpers shared by the guesser classes.
    module Util
      # Tests a filename against a shell-style glob. The match is
      # case-insensitive and wildcards may match a leading dot.
      #
      # @param pattern [String] glob pattern
      # @param path [String] name to test
      # @return [Boolean]
      def test_glob(pattern, path)
        flags = File::FNM_DOTMATCH | File::FNM_CASEFOLD
        File.fnmatch?(pattern, path, flags)
      end

      # Extracts the text from a source.
      #
      # @param source [String, #read] a String (returned as-is) or an object
      #   responding to `read` (its `read` result is returned)
      # @raise [RuntimeError] 'invalid source' for anything else
      def get_source(source)
        return source if source.is_a?(String)
        return source.read if source.respond_to?(:read)

        raise 'invalid source'
      end
    end
  end
end
13 changes: 6 additions & 7 deletions lib/rouge/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def guesses(info={})
guessers << Guessers::Filename.new(filename) if filename
guessers << Guessers::Modeline.new(source) if source
guessers << Guessers::Source.new(source) if source
guessers << Guessers::Disambiguation.new(filename, source) if source && filename

Guesser.guess(guessers, Lexer.all)
end
Expand All @@ -148,7 +149,7 @@ def guesses(info={})
# fails, will be searched for shebangs, <!DOCTYPE ...> tags, and
# other hints.
#
# @see Lexer.analyze_text
# @see Lexer.detect?
# @see Lexer.guesses
def guess(info={})
lexers = guesses(info)
Expand Down Expand Up @@ -425,16 +426,14 @@ def stream_tokens(stream, &b)

# @abstract
#
# Return a number between 0 and 1 indicating the likelihood that
# the text given should be lexed with this lexer. The default
# implementation returns 0. Values under 0.5 will only be used
# to disambiguate filename or mimetype matches.
# Return true if there is an in-text indication (such as a shebang
# or DOCTYPE declaration) that this lexer should be used.
#
# @param [TextAnalyzer] text
# the text to be analyzed, with a couple of handy methods on it,
# like {TextAnalyzer#shebang?} and {TextAnalyzer#doctype?}
def self.analyze_text(text)
0
def self.detect?(text)
false
end
end

Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/apiblueprint.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ class APIBlueprint < Markdown
filenames '*.apib'
mimetypes 'text/vnd.apiblueprint'

def self.analyze_text(text)
return 1 if text.start_with?('FORMAT: 1A\n')
end

prepend :root do
# Metadata
rule(/(\S+)(:\s*)(.*)$/) do
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/awk.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ class Awk < RegexLexer
filenames '*.awk'
mimetypes 'application/x-awk'

def self.analyze_text(text)
return 1 if text.shebang?('awk')
def self.detect?(text)
return true if text.shebang?('awk')
end

id = /[$a-zA-Z_][a-zA-Z0-9_]*/
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/biml.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ class BIML < XML
tag 'biml'
filenames '*.biml'

def self.analyze_text(text)
return 1 if text =~ /<\s*Biml\b/
def self.detect?(text)
return true if text =~ /<\s*Biml\b/
end

prepend :root do
Expand Down
5 changes: 0 additions & 5 deletions lib/rouge/lexers/c.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,6 @@ def self.reserved
)
end

# high priority for filename matches
def self.analyze_text(*)
0.3
end

def self.builtins
@builtins ||= []
end
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/coffeescript.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ class Coffeescript < RegexLexer
title "CoffeeScript"
desc 'The Coffeescript programming language (coffeescript.org)'

def self.analyze_text(text)
return 1 if text.shebang? 'coffee'
def self.detect?(text)
return true if text.shebang? 'coffee'
end

def self.keywords
Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/coq.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@ class Coq < RegexLexer
tag 'coq'
mimetypes 'text/x-coq'

def self.analyze_text(text)
return 0.3 if text.include? "Require"
end

def self.gallina
@gallina ||= Set.new %w(
as fun if in let match then else return end Type Set Prop
Expand Down
8 changes: 4 additions & 4 deletions lib/rouge/lexers/diff.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ class Diff < RegexLexer
filenames '*.diff', '*.patch'
mimetypes 'text/x-diff', 'text/x-patch'

def self.analyze_text(text)
return 1 if text.start_with?('Index: ')
return 1 if text.start_with?('diff ')
return 0.9 if text.start_with?('--- ')
def self.detect?(text)
return true if text.start_with?('Index: ')
return true if text =~ %r(\Adiff[^\n]*?\ba/[^\n]*\bb/)
return true if text =~ /(---|[+][+][+]).*?\n(---|[+][+][+])/
end

state :root do
Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/digdag.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ class Digdag < YAML

mimetypes 'application/x-digdag'

def self.analyze_text(text)
# disable YAML.analyze_text
end

# http://docs.digdag.io/operators.html
# as of digdag v0.9.10
KEYWORD_PATTERN = Regexp.union(%w(
Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/erb.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ class ERB < TemplateLexer

filenames '*.erb', '*.erubis', '*.rhtml', '*.eruby'

def self.analyze_text(text)
return 0.4 if text =~ /<%.*%>/
end

def initialize(opts={})
@ruby_lexer = Ruby.new(opts)

Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/erlang.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ class Erlang < RegexLexer

mimetypes 'text/x-erlang', 'application/x-erlang'

def self.analyze_text(text)
return 0.3 if text =~ /^-module[(]\w+[)][.]/
end

keywords = %w(
after begin case catch cond end fun if
let of query receive try when
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/factor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ class Factor < RegexLexer
filenames '*.factor'
mimetypes 'text/x-factor'

def self.analyze_text(text)
return 1 if text.shebang? 'factor'
def self.detect?(text)
return true if text.shebang? 'factor'
end

def self.builtins
Expand Down
4 changes: 2 additions & 2 deletions lib/rouge/lexers/gherkin.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ class Gherkin < RegexLexer
filenames '*.feature'
mimetypes 'text/x-gherkin'

def self.analyze_text(text)
return 1 if text.shebang? 'cucumber'
def self.detect?(text)
return true if text.shebang? 'cucumber'
end

# self-modifying method that loads the keywords file
Expand Down
4 changes: 0 additions & 4 deletions lib/rouge/lexers/go.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ class Go < RegexLexer

mimetypes 'text/x-go', 'application/x-go'

def self.analyze_text(text)
return 0
end

# Characters

WHITE_SPACE = /[\s\t\r\n]+/
Expand Down
Loading

0 comments on commit 5be8d6e

Please sign in to comment.