#!/usr/bin/env ruby
# frozen_string_literal: true

# Converts one HTML document into Markdown-style text preceded by a YAML
# front matter header, and prints the result to stdout.
#
# Usage: change <file>
#
# The document is parsed with Nokogiri. Footnotes, links, emphasis, headings
# and paragraphs are rewritten in place inside the DOM; whatever markup is
# left over is serialised as HTML at the end.

require 'bundler/inline'

gemfile do
  source 'https://rubygems.org'
  gem 'nokogiri', '~> 1.13'
end

if ARGV[0].nil?
  puts 'No input'
  puts 'Usage: change <file>'
  exit 1
end

filename = ARGV[0]
# Base name without its extension; it prefixes every footnote label below.
file_no_ext = File.basename(filename, File.extname(filename))
# NOTE(review): if the string read here is already tagged as UTF-8, encoding
# it to UTF-8 again presumably does not replace invalid bytes -- confirm
# whether String#scrub is what is wanted.
content = File.read(filename).encode('UTF-8', invalid: :replace, undef: :replace)
doc = Nokogiri.parse(content)

title = doc.title
body = doc.at_css('body')
# Everything below operates on <body>; stop with a clear message if it is absent.
abort "No <body> element found in #{filename}" unless body

# Remove links and paragraphs that contain nothing but whitespace
%w[a p].each do |tag|
  body.css(tag).each { |node| node.remove if node.inner_text.gsub(/[[:space:]]/, '').empty? }
end

# Footnote targets: the element wrapping a named anchor becomes "[^<file><name>]: "
['span.info > a[name]', '.information > sup.enote > a[name]'].each do |selector|
  body.css(selector).each { |a| a.parent.replace("[^#{file_no_ext}#{a['name']}]: ") }
end

# Footnote references: the wrapping <sup> becomes "[^<file><href minus its first char>]".
# Only anchors that actually carry an href are matched, so a stray anchor
# without one no longer raises on nil.
['sup.enote > a[href]', 'sup.anote > a[href]'].each do |selector|
  body.css(selector).each { |a| a.parent.replace("[^#{file_no_ext}#{a['href'][1..]}]") }
end

# Replace inote (not sure) text with italics. Ideally this would be
# subdued but markdown isn't fancy enough
body.css('span.inote').each { |span| span.replace("*#{span.inner_text}*") }

# Replace anchor tags with markdown links
body.css('a').each do |a|
  link = a['href']

  # Anchors without href, but with content: keep just the text
  unless link
    a.replace(a.inner_text)
    next
  end

  link = if link.include?('/glossary/')
           # Don't break glossary links: rewrite them to an absolute URL
           link.sub(%r{.*?/(glossary.*)}, 'https://marxists.org/\1')
         else
           # Every other target is flagged for manual follow-up
           "#TODO;#{link}"
         end

  a.replace("[#{a.inner_text}](#{link})")
end

# Replace italics
%w[i em].each do |tag|
  body.css(tag).each { |node| node.replace("*#{node.inner_text}*") }
end

# Remove horizontal rules
body.css('hr').remove

# Remove carriage returns. The text node's content is assigned directly so the
# text is not re-parsed as markup (replacing a node with a String parses it).
body.search('//text()').each do |text|
  next unless text.content.include?("\r")

  text.content = text.content.delete("\r")
end

# Convert html headings to markdown headings (usage very inconsistent)
{
  'h1' => '#',      # Document title
  'h2' => '###',    # Author's name (sometimes?)
  'h3' => '##',     # Chapter title
  'h4' => '###',    # Section title
  'h5' => '####',   # Subsections
  'h6' => '#####',
  'h7' => '######'
}.each do |tag, marker|
  body.css(tag).each { |heading| heading.replace("#{marker} #{heading.inner_html.strip}") }
end

# Remove footer
body.css('.footer').remove

# Convert paragraphs to plain text
body.css('p').each do |para|
  text = para.inner_html.gsub("\n", ' ').strip
  para_class = para['class']
  # Paragraphs with a class other than "information" get an extra leading newline
  lead = "\n" if para_class && para_class != 'information'
  para.replace("#{lead}#{text}\n\n")
end

# Replace line breaks with blank lines. Done after the paragraph conversion,
# which flattens newlines inside each paragraph, so these are not erased.
body.css('br').each { |br| br.replace("\n\n") }

body.css('span.context').each { |span| span.replace(" *#{span.inner_html}*") }

# Front matter. Backslashes and double quotes are escaped so the title stays a
# valid double-quoted YAML scalar.
safe_title = title.to_s.gsub(/["\\]/) { |char| "\\#{char}" }
puts '---'
puts "title: \"#{safe_title}\""
puts 'date: 2022-11-18'
puts 'draft: true'
puts '---'

# Use the bordered blockquote wrapper when the page has one, otherwise the whole body
container = body.at_css('blockquote > div.border') || body
body_text = container.children.to_html

# Strip each line and collapse runs of three or more newlines into one blank line
puts body_text.lines.map(&:strip).join("\n").gsub(/\n\n\n+/, "\n\n")