# Copyright (C) 2019-2021 all contributors
# License: AGPL-3.0+
# I have no idea how stable or safe this is for handling untrusted
# input, but it seems to have been around for a while, and the
# highlight(1) executable is supported by gitweb and cgit.
#
# I'm also unsure about API stability, but highlight 3.x seems to
# have been around a few years and ikiwiki (apparently the only
# user of the SWIG/Perl bindings, at least in Debian) hasn't needed
# major changes to support it in recent years.
#
# Some code stolen from ikiwiki (GPL-2.0+)
# wrapper for SWIG-generated highlight.pm bindings
package PublicInbox::HlMod;
use strict;
use v5.10.1;
use highlight; # SWIG-generated stuff
use PublicInbox::Hval qw(src_escape ascii_html);
my $hl;
sub _parse_filetypes ($) {
my $ft_conf = $_[0]->getFiletypesConfPath('filetypes') or
die 'filetypes.conf not found by highlight';
open my $fh, '<', $ft_conf or die "failed to open($ft_conf): $!";
local $/;
my $cfg = <$fh>;
my %ext2lang;
my @shebang; # order matters
# Hrm... why isn't this exposed by the highlight API?
# highlight >= 3.2 format (bind-style) (from ikiwiki)
while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+
Extensions\s*=\s*{([^}]+)}/sgx) {
my $lang = $1;
foreach my $bit (split(/,/, $2)) {
$bit =~ s/.*"(.*)".*/$1/s;
$ext2lang{$bit} = $lang;
}
}
# AFAIK, all the regexps used by in filetypes.conf distributed
# by highlight work as Perl REs
while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+
Shebang\s*=\s*\[\s*\[([^}]+)\s*\]\s*\]\s*}\s*,/sgx) {
my ($lang, $re) = ($1, $2);
eval {
my $perl_re = qr/$re/;
push @shebang, [ $lang, $perl_re ];
};
if ($@) {
warn "$lang shebang=[[$re]] did not work in Perl: $@";
}
}
(\%ext2lang, \@shebang);
}
# We only need one instance
sub new {
my ($class) = @_;
$hl ||= do {
my $dir = highlight::DataDir->new;
$dir->initSearchDirectories('');
my ($ext2lang, $shebang) = _parse_filetypes($dir);
bless {
-dir => $dir,
-ext2lang => $ext2lang,
-shebang => $shebang,
}, $class;
};
}
sub _shebang2lang ($$) {
my ($self, $str) = @_;
my $shebang = $self->{-shebang};
foreach my $s (@$shebang) {
return $s->[0] if $$str =~ $s->[1];
}
undef;
}
sub _path2lang ($$) {
my ($self, $path) = @_;
my ($ext) = ($path =~ m!([^\\/\.]+)\z!);
$ext = lc($ext);
$self->{-ext2lang}->{$ext} || $ext;
}
sub do_hl {
my ($self, $str, $path) = @_;
my $lang = _path2lang($self, $path) if defined $path;
do_hl_lang($self, $str, $lang);
}
sub do_hl_lang {
my ($self, $str, $lang) = @_;
my $langpath;
if (defined $lang) {
$langpath = $self->{-dir}->getLangPath("$lang.lang") or return;
undef $lang unless -f $langpath;
}
$lang //= _shebang2lang($self, $str) // return;
$langpath = $self->{-dir}->getLangPath("$lang.lang") or return;
return unless -f $langpath;
my $g = highlight::CodeGenerator::getInstance($highlight::HTML);
$g->setFragmentCode(1); # generate html fragment
# whatever theme works
$g->initTheme($self->{-dir}->getThemePath('print.theme'));
$g->loadLanguage($langpath);
$g->setEncoding('utf-8');
# we assume $$str is valid UTF-8, but the SWIG binding doesn't
# know that, so ensure it's marked as UTF-8 even if it isnt...
my $out = $g->generateString($$str);
highlight::CodeGenerator::deleteInstance($g);
utf8::decode($out);
src_escape($out);
\$out;
}
# Highlight text, but support Markdown "```$LANG" notation
# while preserving WYSIWYG of plain-text documentation.
# This is NOT to be enabled by default or encouraged for parsing
# emails, since it is NOT stable and can lead to standards
# proliferation of email.
sub do_hl_text {
my ($self, $str) = @_;
$$str = join('', map {
if (/\A(``` ?)(\w+)\s*?\n(.+)(^```\s*)\z/sm) {
my ($pfx, $lang, $code, $post) = ($1, $2, $3, $4);
my $hl = do_hl_lang($self, \$code, $lang) || \$code;
$pfx . $lang . "\n" . $$hl . $post;
} else {
ascii_html($_);
}
} split(/(^``` ?\w+\s*?\n.+?^```\s*$)/sm, $$str));
}
1;