summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Admin <admin@marx.cafe> 2022-11-26 02:24:21 -0500
committer Admin <admin@marx.cafe> 2022-11-26 02:24:21 -0500
commit 4c2ac978c0ce038725ddcfabe32a3d2e0da8dfde (patch)
tree 16adfc771409e84e04c0cda635c2e2063948c2e6
parent 516e3a1f3398cb032645b3914615ccd6d15d4bc4 (diff)
Add translation script
-rwxr-xr-x html2md.rb 101
1 files changed, 101 insertions, 0 deletions
diff --git a/html2md.rb b/html2md.rb
new file mode 100755
index 0000000..0ba7f55
--- /dev/null
+++ b/html2md.rb
@@ -0,0 +1,101 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require 'bundler/inline'
+
+gemfile do
+ source 'https://rubygems.org'
+ gem 'nokogiri', '~> 1.13'
+end
+
+# Diagnostics go to stderr so that stdout carries nothing but the generated markdown
+if ARGV.empty?
+  warn 'No input'
+  warn "Usage: #{File.basename($PROGRAM_NAME)} <input file>"
+  exit 1
+end
+filename = ARGV[0]
+file_no_ext = File.basename(filename, File.extname(filename)) # prefix for footnote labels
+# Force the HTML parser: Nokogiri.parse sniffs the input and may pick XML, which lacks #title
+doc = Nokogiri::HTML(File.read(filename))
+
+title = doc.title
+# Bail out early with a readable message: every step below dereferences body
+body = doc.at_css('body') || abort("#{filename}: no <body> element found")
+# Remove links and paragraphs that contain nothing but whitespace
+body.css('a').each { |a| a.remove if a.inner_text.gsub(/[[:space:]]/, '').empty? }
+body.css('p').each { |p| p.remove if p.inner_text.gsub(/[[:space:]]/, '').empty? }
+# Footnote targets: the wrapper of a named anchor becomes a "[^<file><name>]: " definition
+body.css('span.info > a[name], .information > sup.enote > a[name]')
+    .each { |a| a.parent.replace("[^#{file_no_ext}#{a['name']}]: ") }
+# Footnote references: only anchors carrying an href (a missing href used to raise on nil)
+body.css('sup.enote > a[href], sup.anote > a[href]')
+    .each { |a| a.parent.replace("[^#{file_no_ext}#{a['href'].delete_prefix('#')}]") }
+# Replace inote (not sure) text with italics; markdown cannot render it subdued
+body.css('span.inote').each { |span| span.replace("*#{span.inner_text}*") }
+# Turn every remaining anchor into a markdown link
+body.css('a').each do |anchor|
+  label = anchor.inner_text
+  href = anchor['href']
+  # An anchor without href keeps only its content
+  if href.nil?
+    anchor.replace(label)
+    next
+  end
+  # Don't break glossary links: rewrite them onto marxists.org; prefix all others with "#TODO;"
+  target = if href['/glossary/']
+             href.sub(%r{.*?/(glossary.*)}, 'https://marxists.org/\1')
+           else
+             "#TODO;#{href}"
+           end
+  anchor.replace "[#{label}](#{target})"
+end
+# Italics: all <i> first, then all <em>, each reduced to its text wrapped in asterisks
+%w[i em].each do |tag|
+  body.css(tag).each { |node| node.replace("*#{node.inner_text}*") }
+end
+# Drop horizontal rules
+body.css('hr').each(&:remove)
+# Strip carriage returns from every text node matched by the XPath
+text_nodes = body.search('//text()')
+text_nodes.each { |node| node.replace(node.text.delete("\r")) }
+# Convert html headings to markdown headings (usage in the sources is very
+# inconsistent); the table maps each tag to the markdown prefix it receives
+{
+  'h1' => '#',      # Document title
+  'h2' => '###',    # Author's name (sometimes?)
+  'h3' => '##',     # Chapter title
+  'h4' => '###',    # Section title
+  'h5' => '####',   # Subsections
+  'h6' => '#####',
+  'h7' => '######'
+}.each do |tag, prefix|
+  body.css(tag).each { |heading| heading.replace("#{prefix} #{heading.inner_html.strip}") }
+end
+
+# Drop the footer
+body.css('.footer').each(&:remove)
+# Flatten each paragraph onto one line followed by a blank line; a class other
+# than "information" is kept as an HTML comment on the line above the text
+body.css('p').each do |para|
+  klass = para['class']
+  marker = klass && klass != 'information' ? "<!-- class: #{klass} -->\n" : ''
+  para.replace("#{marker}#{para.inner_html.tr("\n", ' ').strip}\n\n")
+end
+# <br> becomes a blank line; done after the <p> pass so these newlines are not erased
+body.css('br').each { |br| br.replace("\n\n") }
+body.css('span.context').each { |span| span.replace("<!-- #{span['class']} --> *#{span.inner_html}*") }
+
+# Front matter; the date and draft flag are fixed values
+puts '---',
+     "title: #{title}",
+     'date: 2022-11-18',
+     'draft: true',
+     '---'
+
+# Use the bordered blockquote wrapper when the page has one, otherwise the whole body
+container = body.at_css('blockquote > div.border') || body
+body_text = container.children.to_html
+
+# Trim every line, then collapse runs of blank lines down to a single one
+puts body_text.lines.map(&:strip).join("\n").gsub(/\n\n\n+/, "\n\n")