1 # Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
2 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
4 # I have no idea how stable or safe this is for handling untrusted
5 # input, but it seems to have been around for a while, and the
6 # highlight(1) executable is supported by gitweb and cgit.
8 # I'm also unsure about API stability, but highlight 3.x seems to
9 # have been around a few years and ikiwiki (apparently the only
10 # user of the SWIG/Perl bindings, at least in Debian) hasn't needed
11 # major changes to support it in recent years.
13 # Some code stolen from ikiwiki (GPL-2.0+)
14 # wrapper for SWIG-generated highlight.pm bindings
15 package PublicInbox::HlMod;
18 use highlight; # SWIG-generated stuff
19 use PublicInbox::Hval qw(src_escape ascii_html);
22 sub _parse_filetypes ($) {
23 my $ft_conf = $_[0]->getFiletypesConfPath('filetypes') or
24 die 'filetypes.conf not found by highlight';
25 open my $fh, '<', $ft_conf or die "failed to open($ft_conf): $!";
29 my @shebang; # order matters
31 # Hrm... why isn't this exposed by the highlight API?
32 # highlight >= 3.2 format (bind-style) (from ikiwiki)
33 while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+
34 Extensions\s*=\s*{([^}]+)}/sgx) {
36 foreach my $bit (split(/,/, $2)) {
37 $bit =~ s/.*"(.*)".*/$1/s;
38 $ext2lang{$bit} = $lang;
41 # AFAIK, all the regexps used by in filetypes.conf distributed
42 # by highlight work as Perl REs
43 while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+
44 Shebang\s*=\s*\[\s*\[([^}]+)\s*\]\s*\]\s*}\s*,/sgx) {
45 my ($lang, $re) = ($1, $2);
47 my $perl_re = qr/$re/;
48 push @shebang, [ $lang, $perl_re ];
51 warn "$lang shebang=[[$re]] did not work in Perl: $@";
54 (\%ext2lang, \@shebang);
57 # We only need one instance
61 my $dir = highlight::DataDir->new;
62 $dir->initSearchDirectories('');
63 my ($ext2lang, $shebang) = _parse_filetypes($dir);
66 -ext2lang => $ext2lang,
72 sub _shebang2lang ($$) {
73 my ($self, $str) = @_;
74 my $shebang = $self->{-shebang};
75 foreach my $s (@$shebang) {
76 return $s->[0] if $$str =~ $s->[1];
82 my ($self, $path) = @_;
83 my ($ext) = ($path =~ m!([^\\/\.]+)\z!);
85 $self->{-ext2lang}->{$ext} || $ext;
89 my ($self, $str, $path) = @_;
90 my $lang = _path2lang($self, $path) if defined $path;
91 do_hl_lang($self, $str, $lang);
95 my ($self, $str, $lang) = @_;
99 $langpath = $self->{-dir}->getLangPath("$lang.lang") or return;
100 undef $lang unless -f $langpath;
102 $lang //= _shebang2lang($self, $str) // return;
103 $langpath = $self->{-dir}->getLangPath("$lang.lang") or return;
104 return unless -f $langpath;
106 my $g = highlight::CodeGenerator::getInstance($highlight::HTML);
107 $g->setFragmentCode(1); # generate html fragment
109 # whatever theme works
110 $g->initTheme($self->{-dir}->getThemePath('print.theme'));
111 $g->loadLanguage($langpath);
112 $g->setEncoding('utf-8');
113 # we assume $$str is valid UTF-8, but the SWIG binding doesn't
114 # know that, so ensure it's marked as UTF-8 even if it isnt...
115 my $out = $g->generateString($$str);
116 highlight::CodeGenerator::deleteInstance($g);
122 # Highlight text, but support Markdown "```$LANG" notation
123 # while preserving WYSIWYG of plain-text documentation.
124 # This is NOT to be enabled by default or encouraged for parsing
125 # emails, since it is NOT stable and can lead to standards
126 # proliferation of email.
128 my ($self, $str) = @_;
130 $$str = join('', map {
131 if (/\A(``` ?)(\w+)\s*?\n(.+)(^```\s*)\z/sm) {
132 my ($pfx, $lang, $code, $post) = ($1, $2, $3, $4);
133 my $hl = do_hl_lang($self, \$code, $lang) || \$code;
134 $pfx . $lang . "\n" . $$hl . $post;
138 } split(/(^``` ?\w+\s*?\n.+?^```\s*$)/sm, $$str));