#!/usr/bin/env ruby
# frozen_string_literal: true

# html2md.rb - convert one HTML page to Markdown with draft front matter.
#
# Provenance: recovered from the patch "Add translation script"
# (commit 4c2ac978c0ce038725ddcfabe32a3d2e0da8dfde, author "Admin",
# Sat, 26 Nov 2022 02:24:21 -0500), which adds this file as a new executable.
#
# Usage: html2md.rb FILE.html > FILE.md
#
# The page is parsed with Nokogiri and its <body> is rewritten in place, one
# pass per construct (footnotes, links, italics, headings, paragraphs, line
# breaks). The statement order matters: later passes rely on what earlier
# passes removed or replaced. The result is printed to stdout after a
# front-matter header. Exits with status 1 when the input is missing or
# unusable; diagnostics go to stderr so they never end up in the Markdown.

require 'bundler/inline'

gemfile do
  source 'https://rubygems.org'
  gem 'nokogiri', '~> 1.13'
end

require 'json'

# Date written into the front matter of every converted page.
FRONT_MATTER_DATE = '2022-11-18'

# Markdown heading marker emitted for each HTML heading tag. Heading usage in
# the source pages is very inconsistent, hence the non-monotonic mapping.
HEADING_MARKERS = {
  'h1' => '#',      # document title
  'h2' => '###',    # author's name (sometimes?)
  'h3' => '##',     # chapter title
  'h4' => '###',    # section title
  'h5' => '####',   # subsections
  'h6' => '#####',
  'h7' => '######'  # not a standard HTML element; handled in case a page uses it
}.freeze

input_path = ARGV[0]
if input_path.nil?
  warn 'No input'
  warn "Usage: #{File.basename($PROGRAM_NAME)} FILE.html"
  exit 1
end
abort "No such file: #{input_path}" unless File.file?(input_path)

# Footnote labels are prefixed with the file's base name (extension removed).
note_prefix = File.basename(input_path, File.extname(input_path))

# Always use the HTML parser: Nokogiri.parse sniffs the input and may pick the
# XML parser, whose documents have no #title.
doc = Nokogiri::HTML(File.read(input_path))
body = doc.at_css('body')
abort "No <body> found in #{input_path}" unless body

# Collapse whitespace and emit the title as a double-quoted scalar (a JSON
# string is a valid YAML double-quoted string), so titles containing ": ",
# quotes or line breaks do not corrupt the front matter.
front_matter_title = JSON.generate(doc.title.to_s.gsub(/[[:space:]]+/, ' ').strip)

# Remove empty links and paragraphs
%w[a p].each do |tag|
  body.css(tag).each { |node| node.remove if node.inner_text.match?(/\A[[:space:]]*\z/) }
end

# Footnote targets: the element wrapping a named anchor becomes "[^<file><name>]: "
['span.info > a[name]', '.information > sup.enote > a[name]'].each do |selector|
  body.css(selector).each { |a| a.parent.replace("[^#{note_prefix}#{a['name']}]: ") }
end

# Footnote references: the wrapping <sup> becomes "[^<file><fragment>]".
# Only anchors that carry an href are handled; the first character of the href
# (expected to be "#") is dropped.
['sup.enote > a[href]', 'sup.anote > a[href]'].each do |selector|
  body.css(selector).each { |a| a.parent.replace("[^#{note_prefix}#{a['href'][1..]}]") }
end

# Replace inote (not sure) text with italics. Ideally this would be
# subdued but markdown isn't fancy enough
body.css('span.inote').each { |span| span.replace("*#{span.inner_text}*") }

# Replace anchor tags with markdown links
body.css('a').each do |a|
  href = a['href']
  # Anchors without href, but with content: keep only the text
  if href.nil?
    a.replace(a.inner_text)
    next
  end

  target = if href.include?('/glossary/')
             # Don't break glossary links: point them at the absolute site URL
             href.sub(%r{.*?/(glossary.*)}, 'https://marxists.org/\1')
           else
             # Every other link is flagged for manual review
             "#TODO;#{href}"
           end
  a.replace("[#{a.inner_text}](#{target})")
end

# Replace italics
%w[i em].each do |tag|
  body.css(tag).each { |node| node.replace("*#{node.inner_text}*") }
end

# Remove horizontal rules
body.css('hr').remove

# Remove carriage returns. The text is assigned through #content= so it stays
# literal text; passing a string to #replace would re-parse it as markup.
body.search('//text()').each do |node|
  next unless node.content.include?("\r")

  node.content = node.content.delete("\r")
end

# Convert html headings to markdown headings
HEADING_MARKERS.each do |tag, marker|
  body.css(tag).each { |heading| heading.replace("#{marker} #{heading.inner_html.strip}") }
end

# Remove footer
body.css('.footer').remove

# Convert paragraphs to plain text followed by a blank line. Paragraphs with a
# class other than "information" also get a leading newline.
body.css('p').each do |para|
  text = para.inner_html.tr("\n", ' ').strip
  css_class = para['class']
  lead = "\n" if css_class && css_class != 'information'
  para.replace("#{lead}#{text}\n\n")
end

# Remove line breaks, done after removing <p> to not get newlines erased
body.css('br').each { |br| br.replace("\n\n") }
body.css('span.context').each { |span| span.replace(" *#{span.inner_html}*") }

puts '---'
puts "title: #{front_matter_title}"
puts "date: #{FRONT_MATTER_DATE}"
puts 'draft: true'
puts '---'

# Prefer the bordered content box when the page has one, else the whole body.
content_root = body.at_css('blockquote > div.border') || body
markdown = content_root.children.to_html

# Strip every line and collapse runs of blank lines down to a single one.
puts markdown.lines.map(&:strip).join("\n").gsub(/\n{3,}/, "\n\n")