/ common / mixed_content_parser.rb
mixed_content_parser.rb
 1  require 'java'
 2  
 3  module MixedContentParser
 4  
 5    def self.parse(content, base_uri, opts = {} )
 6      opts[:pretty_print] ||= false
 7  
 8      return if content.nil?
 9  
10      content.strip!
11      content.chomp!
12  
13      return '' if content.empty?
14  
15      # create an empty document just to get an outputSettings object
16      # (seems like the API falls down when we do this directly...)
17      d = org.jsoup.Jsoup.parse("")
18      d.outputSettings.prettyPrint(opts[:pretty_print])
19  
20      # archon does things differently.....
21      content.gsub!("\n\t", "\n\n")
22  
23      # transform blocks of text seperated by line breaks into <p> wrapped blocks
24      content = content.split("\n\n").inject("") { |c, n| c << "<p>#{n}</p>" } if opts[:wrap_blocks]
25  
26      whitelist = org.jsoup.safety.Whitelist.relaxed
27                                            .addTags("emph", "lb", "title", "unitdate")
28                                            .addAttributes("emph", "render")
29                                            .addAttributes("title", "render")
30                                            .addAttributes("unitdate", "render")
31  
32      cleaned_content = org.jsoup.Jsoup.clean(content, base_uri, whitelist, d.outputSettings())
33  
34      document = org.jsoup.Jsoup.parse(cleaned_content, base_uri, org.jsoup.parser.Parser.xmlParser())
35      document.outputSettings.escapeMode(Java::OrgJsoupNodes::Entities::EscapeMode.xhtml)
36      document.outputSettings.prettyPrint(opts[:pretty_print])
37  
38      # replace lb with br
39      document.select("lb").tagName("br")
40  
41      # tweak the emph tags
42      [ "emph", "title", "unitdate"  ].each do |tag|
43        document.select(tag).each do |emph|
44          # make all emph's a span
45          emph.tagName("span")
46  
47          # <emph> should render as <em> if there is no @render attribute. If there is, render as follows:
48          if emph.attr("render").empty?
49            emph.attr("class", "emph render-none")
50  
51          # render="nonproport": <code>
52          elsif emph.attr("render") === "nonproport"
53            emph.attr("class", "emph render-#{emph.attr("render")}")
54            emph.tagName("code")
55            emph.removeAttr("render")
56  
57          # set a class so CSS can style based on the render value
58          else
59            emph.attr("class", "emph render-#{emph.attr("render")}")
60            emph.removeAttr("render")
61          end
62        end
63      end
64      document.toString()
65    end
66  
67  end