X-Git-Url: http://www.git.stargrave.org/?a=blobdiff_plain;f=lib%2FPublicInbox%2FSmsg.pm;h=b132381b4ab2c3a93fd78b1fe042d1710a2c3fe8;hb=5198c976ce8b1954f0f76a0da152cc434411f147;hp=aaf88f355e60952291226aefba9ae970e369a609;hpb=08e8e9522405e40985935b13e9eaaf409e33da75;p=public-inbox.git diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index aaf88f35..b132381b 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors +# Copyright (C) 2015-2021 all contributors # License: AGPL-3.0+ # # A small/skeleton/slim representation of a message. @@ -9,19 +9,14 @@ # large threads in our WWW UI and the NNTP range responses. package PublicInbox::Smsg; use strict; -use warnings; -use base qw(Exporter); +use v5.10.1; +use parent qw(Exporter); our @EXPORT_OK = qw(subject_normalized); -use PublicInbox::MID qw(mids); +use PublicInbox::MID qw(mids references); use PublicInbox::Address; use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -use Time::Local qw(timegm); -sub get_val ($$) { - my ($doc, $col) = @_; - # sortable_unserialise is defined by PublicInbox::Search::load_xapian() - sortable_unserialise($doc->get_value($col)); -} +sub oidbin { pack('H*', $_[0]->{blob}) } sub to_doc_data { my ($self) = @_; @@ -40,6 +35,7 @@ sub to_doc_data { sub load_from_data ($$) { my ($self) = $_[0]; # data = $_[1] + utf8::decode($_[1]); ( $self->{subject}, $self->{from}, @@ -60,41 +56,39 @@ sub load_from_data ($$) { ) = split(/\n/, $_[1]); } -sub load_expand { - my ($self, $doc) = @_; - my $data = $doc->get_data or return; - $self->{ts} = get_val($doc, PublicInbox::Search::TS()); - my $dt = get_val($doc, PublicInbox::Search::DT()); - my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); - $self->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); - utf8::decode($data); - load_from_data($self, $data); - $self; -} - sub psgi_cull ($) { my ($self) = @_; - # ghosts don't have ->{from} - my $from = delete($self->{from}) // ''; - my @n = PublicInbox::Address::names($from); - $self->{from_name} = join(', ', @n); - # drop NNTP-only fields which aren't relevant to PSGI results: # saves ~80K on a 200 item search result: - delete @$self{qw(ts to cc bytes lines)}; + # TODO: we may need to keep some of these for JMAP... + my ($f) = delete @$self{qw(from tid to cc bytes lines)}; + # ghosts don't have ->{from} + $self->{from_name} = join(', ', PublicInbox::Address::names($f // '')); $self; } -# Only called by PSGI interface, not NNTP -sub from_mitem { - my ($mitem, $srch) = @_; - return $srch->retry_reopen(\&from_mitem, $mitem) if $srch; - my $self = bless {}, __PACKAGE__; - psgi_cull(load_expand($self, $mitem->get_document)); +sub parse_references ($$$) { + my ($smsg, $hdr, $mids) = @_; + my $refs = references($hdr); + push(@$refs, @$mids) if scalar(@$mids) > 1; + return $refs if scalar(@$refs) == 0; + + # prevent circular references here: + my %seen = ( ($smsg->{mid} // '') => 1 ); + my @keep; + foreach my $ref (@$refs) { + if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) { + warn "References: <$ref> too long, ignoring\n"; + next; + } + $seen{$ref} //= push(@keep, $ref); + } + $smsg->{references} = '<'.join('> <', @keep).'>' if @keep; + \@keep; } -# for Import and v1 non-SQLite WWW code paths +# used for v2, Import and v1 non-SQLite WWW code paths sub populate { my ($self, $hdr, $sync) = @_; for my $f (qw(From To Cc Subject)) { @@ -118,13 +112,13 @@ sub populate { $self->{$f} = $val if $val ne ''; } $sync //= {}; - $self->{-ds} = [ my @ds = msg_datestamp($hdr, $sync->{autime}) ]; - $self->{-ts} = [ my @ts = msg_timestamp($hdr, $sync->{cotime}) ]; + my @ds = msg_datestamp($hdr, $sync->{autime} // $self->{ds}); + my @ts = msg_timestamp($hdr, $sync->{cotime} // $self->{ts}); + $self->{-ds} = \@ds; + $self->{-ts} = \@ts; $self->{ds} //= $ds[0]; # no zone $self->{ts} //= $ts[0]; - - # for v1 users w/o SQLite - $self->{mid} //= eval { mids($hdr)->[0] } // ''; + $self->{mid} //= mids($hdr)->[0]; } # no strftime, that is locale-dependent and not for RFC822 @@ -149,6 +143,8 @@ sub internaldate { # for IMAP our $REPLY_RE = qr/^re:\s+/i; +# TODO: see RFC 5256 sec 2.1 "Base Subject" and evaluate compatibility +# w/ existing indices... sub subject_normalized ($) { my ($subj) = @_; $subj =~ s/\A\s+//s; # no leading space @@ -159,4 +155,17 @@ sub subject_normalized ($) { $subj; } +# returns the number of bytes to add if given a non-CRLF arg +sub crlf_adjust ($) { + if (index($_[0], "\r\n") < 0) { + # common case is LF-only, every \n needs an \r; + # so favor a cheap tr// over an expensive m//g + $_[0] =~ tr/\n/\n/; + } else { # count number of '\n' w/o '\r', expensive: + scalar(my @n = ($_[0] =~ m/(?{bytes} = $_[2] + crlf_adjust($_[1]) } + 1;