lib/PublicInbox/OverIdx.pm | 7 ++++--- lib/PublicInbox/Smsg.pm | 2 ++ diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 2e3d4534f125d92383f3e05e03fcf0ad10b6e3a1..0c8a4d9ee3f846d67cd514a41e1cbe8e916950be 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -243,12 +243,13 @@ } $tid; } -# normalize subjects so they are suitable as pathnames for URLs -# XXX: consider for removal +# normalize subjects somewhat, they used to be ASCII-only but now +# we use \w for UTF-8 support. We may still drop it entirely and +# rely on Xapian for subject matches... sub subject_path ($) { my ($subj) = @_; $subj = subject_normalized($subj); - $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; + $subj =~ s![^\w\.~/\-]+!_!g; lc($subj); } diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index da8ce590991a3e40522bcab2bc0c561d5aa5683f..fb28eff7326e06d19e05cc1a096fc55cf54b3bdd 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -145,6 +145,8 @@ } our $REPLY_RE = qr/^re:\s+/i; +# TODO: see RFC 5256 sec 2.1 "Base Subject" and evaluate compatibility +# w/ existing indices... sub subject_normalized ($) { my ($subj) = @_; $subj =~ s/\A\s+//s; # no leading space