# html2md.rb

#!/usr/bin/env ruby
# frozen_string_literal: true

require 'bundler/inline'

gemfile do
  source 'https://rubygems.org'
  gem 'nokogiri', '~> 1.13'
end

# Bail out early when no input file was given on the command line.
# Diagnostics go to stderr: stdout carries the generated markdown, so anything
# printed there would end up in the output when it is redirected to a file.
if ARGV.empty?
  warn 'No input'
  warn "Usage: #{File.basename($PROGRAM_NAME)} <input file>"
  exit 1
end

# Read the input file and parse it as HTML.
filename = ARGV[0]
# Base name without extension; used further down as a prefix for footnote labels.
file_no_ext = File.basename(filename, File.extname(filename))
content = File.read(filename)
# Always use the HTML parser. Nokogiri.parse sniffs the content and falls back
# to the XML parser when it does not find an <html>/doctype marker, and an XML
# document has no #title.
doc = Nokogiri::HTML(content)

title = doc.title
body = doc.at_css('body')
# Every later step operates on body, so stop with a clear message if it is missing.
abort "#{filename}: no <body> element found" unless body
# Drop links, then paragraphs, whose text is empty once all whitespace
# is stripped out
%w[a p].each do |tag|
  body.css(tag).each do |node|
    node.remove if node.inner_text.gsub(/[[:space:]]/, '').empty?
  end
end
# Footnote targets: the element wrapping a named anchor is replaced by a
# markdown footnote label. Labels are prefixed with the input file's base name.
['span.info > a[name]', '.information > sup.enote > a[name]'].each do |selector|
  body.css(selector).each { |a| a.parent.replace("[^#{file_no_ext}#{a.attr('name')}]: ") }
end
# Footnote references: the wrapping <sup> is replaced by a markdown footnote
# reference. Only anchors that carry an href are selected, since the label is
# built from it; [1..] drops the href's first character (presumably the
# leading '#' of a fragment link).
['sup.enote > a[href]', 'sup.anote > a[href]'].each do |selector|
  body.css(selector).each { |a| a.parent.replace("[^#{file_no_ext}#{a.attr('href')[1..]}]") }
end
# span.inote becomes italic text. Ideally it would be rendered in a
# subdued style, but markdown has no construct for that.
body.css('span.inote').each do |note|
  note.replace("*#{note.inner_text}*")
end
# Replace the remaining anchor tags with markdown links
body.css('a').each do |a|
  href = a['href']
  # Anchors without href, but with content: keep only the text
  unless href
    a.replace(a.inner_text)
    next
  end
  target = if href.include?('/glossary/')
             # Don't break glossary links: rewrite them to absolute marxists.org URLs
             href.sub(%r{.*?/(glossary.*)}, 'https://marxists.org/\1')
           else
             # Every other link is tagged so it can be found and fixed by hand later
             "#TODO;#{href}"
           end
  a.replace "[#{a.inner_text}](#{target})"
end
# <i> and <em> become markdown emphasis
%w[i em].each do |tag|
  body.css(tag).each { |node| node.replace("*#{node.inner_text}*") }
end
# Horizontal rules are dropped entirely
body.css('hr').remove
# Strip carriage returns out of every text node
body.search('//text()').each do |node|
  node.replace(node.text.delete("\r"))
end
# Convert html headings to markdown headings. The source documents use
# heading levels very inconsistently, so the mapping is not one-to-one.
{
  'h1' => '#',      # Document title
  'h2' => '###',    # Author's name (sometimes?)
  'h3' => '##',     # Chapter title
  'h4' => '###',    # Section title
  'h5' => '####',   # Subsections
  'h6' => '#####',
  'h7' => '######'
}.each do |tag, marker|
  body.css(tag).each { |heading| heading.replace("#{marker} #{heading.inner_html.strip}") }
end

# Drop the page footer
body.css('.footer').remove
# Flatten each paragraph to a single line of text followed by a blank line.
# A class other than 'information' is preserved in an HTML comment above it.
body.css('p').each do |para|
  css_class = para['class']
  annotation = css_class && css_class != 'information' ? "<!-- class: #{css_class} -->\n" : nil
  flattened = para.inner_html.gsub("\n", ' ').strip
  para.replace("#{annotation}#{flattened}\n\n")
end
# Line breaks are handled only after the paragraphs, so the newlines they
# turn into are not flattened to spaces by the paragraph pass
body.css('br').each { |line_break| line_break.replace("\n\n") }
# Context spans become italics, tagged with their class in an HTML comment
body.css('span.context').each do |context|
  context.replace("<!-- #{context.attr('class')} --> *#{context.inner_html}*")
end

# Front matter block
puts '---', "title: #{title}", 'date: 2022-11-18', 'draft: true', '---'

# Prefer the contents of the bordered div inside a blockquote when the page
# has one; otherwise emit everything inside <body>
bordered = body.at_css('blockquote > div.border')
markdown = (bordered || body).children.to_html

# Trim every line and collapse runs of blank lines down to a single one
trimmed_lines = markdown.lines.map(&:strip)
puts trimmed_lines.join("\n").gsub(/\n\n\n+/, "\n\n")