X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FLinkify.pm;h=306a57e77c73d03b99a4ddfd99f98f6119e457db;hb=HEAD;hp=d4df689e495ad8fd5babfec0989415c10a7e4ad3;hpb=db07ffcc3243a19ff5b6edf7f3bfc19cb7460df0;p=public-inbox.git diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index d4df689e..306a57e7 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2016 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # two-step linkification. @@ -11,57 +11,114 @@ # Maybe this could be done more efficiently... package PublicInbox::Linkify; use strict; -use warnings; -use Digest::SHA qw/sha1_hex/; +use v5.10.1; +use PublicInbox::SHA qw(sha1_hex); +use PublicInbox::Hval qw(ascii_html mid_href); +use PublicInbox::MID qw($MID_EXTRACT); my $SALT = rand; -my $LINK_RE = qr{\b((?:ftps?|https?|nntps?|gopher):// - [\@:\w\.-]+/ - ?[!,:~\$\@\w\+\&\?\.\%\;/#=-]*)}x; +my $LINK_RE = qr{([\('!])?\b((?:ftps?|https?|nntps?|imaps?|s?news|gopher):// + [\@:\w\.-]+(?:/ + (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*) + (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)? + (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)? + )? + )}xi; -sub new { bless {}, shift } +sub new { bless {}, $_[0] } + +# try to distinguish paired punctuation chars from the URL itself +# Maybe other languages/formats can be supported here, too... +my %pairs = ( + "(" => qr/(\)[\.,;\+]?)\z/, # Markdown (,), Ruby (+) (, for arrays) + "'" => qr/('[\.,;\+]?)\z/, # Perl / Ruby + "!" => qr/(![\.,;\+]?)\z/, # Perl / Ruby +); sub linkify_1 { - my ($self, $s) = @_; - $s =~ s!$LINK_RE! - my $url = $1; + $_[1] =~ s^$LINK_RE^ + my $beg = $1 || ''; + my $url = $2; my $end = ''; # it's fairly common to end URLs in messages with # '.', ',' or ';' to denote the end of a statement; # assume the intent was to end the statement/sentence # in English - if ($url =~ s/([\.,;])\z//) { - $end = $1; + if (defined(my $re = $pairs{$beg})) { + if ($url =~ s/$re//) { + $end = $1; + } + } elsif ($url =~ s/(\))?([\.,;])\z//) { + $end = $2; + # require ')' to be paired with '(' + if (defined $1) { # ')' + if (index($url, '(') < 0) { + $end = ")$end"; + } else { + $url .= ')'; + } + } + } elsif ($url !~ /\(/ && $url =~ s/\)\z//) { + $end = ')'; } + $url = ascii_html($url); # for IDN + # salt this, as this could be exploited to show # links in the HTML which don't show up in the raw mail. my $key = sha1_hex($url . $SALT); - - # only escape ampersands, others do not match LINK_RE - $url =~ s/&/&/g; - $self->{$key} = $url; - 'PI-LINK-'. $key . $end; - !ge; - $s; + $key =~ tr/0-9/A-J/; # no digits for YAML highlight + $_[0]->{$key} = $url; + $beg . 'LINKIFY' . $key . $end; + ^geo; + $_[1]; } sub linkify_2 { - my ($self, $s) = @_; - - # Added "PI-LINK-" prefix to avoid false-positives on git commits - $s =~ s!\bPI-LINK-([a-f0-9]{40})\b! + # Added "LINKIFY" prefix to avoid false-positives on git commits + $_[1] =~ s!\bLINKIFY([a-fA-J]{40})\b! my $key = $1; - my $url = $self->{$key}; + my $url = $_[0]->{$key}; if (defined $url) { "$url"; - } else { - # false positive or somebody tried to mess with us - $key; + } else { # false positive or somebody tried to mess with us + 'LINKIFY'.$key; + } + !ge; + $_[1]; +} + +# single pass linkification of within $str +# with $pfx being the URL prefix +sub linkify_mids { + my ($self, $pfx, $str, $raw) = @_; + $$str =~ s!$MID_EXTRACT! + my $mid = $1; + my $html = ascii_html($mid); + my $href = mid_href($mid); + + # salt this, as this could be exploited to show + # links in the HTML which don't show up in the raw mail. + my $key = sha1_hex($html . $SALT); + $key =~ tr/0-9/A-J/; + my $repl = qq(<$html>); + $repl .= qq{ (raw)} if $raw; + $self->{$key} = $repl; + 'LINKIFY'.$key; + !ge; + $$str = ascii_html($$str); + $$str =~ s!\bLINKIFY([a-fA-J]{40})\b! + my $key = $1; + my $repl = $_[0]->{$key}; + if (defined $repl) { + $repl; + } else { # false positive or somebody tried to mess with us + 'LINKIFY'.$key; } !ge; - $s; } +sub to_html { linkify_2($_[0], ascii_html(linkify_1(@_))) } + 1;