#
# $Id$ - modified by hzqbbc
#
package Encode::IMAPUTF7;
use strict;
use warnings;
# This package defines its own encode/decode while also loading Encode,
# so the prototype-mismatch and redefinition warnings are silenced here.
no warnings qw(prototype redefine);
use base qw(Encode::Encoding);
__PACKAGE__->Define('IMAP-UTF-7', 'imap-utf-7');
our $VERSION = '1.00';
use MIME::Base64;
use Encode;

#
# Algorithms taken from Unicode::String by Gisle Aas
# Code directly borrowed from Encode::Unicode::UTF7 by Dan Kogai
#

# Character classes taken directly from RFC 2060, section 5.1.3.
# They are spelled as explicit code point ranges rather than as an
# interpolated list of punctuation: inside a double-quoted string "$%" is
# the format page-number variable, which silently dropped "$" and "%" from
# the pass-through set.
#
# Printable US-ASCII except "&" (0x26) represents itself.
my $re_asis = qr/[\x20-\x25\x27-\x7e]/;

# Everything outside printable US-ASCII (controls, DEL, non-ASCII) goes
# through modified base64.  "&" is handled separately as "&-".
my $re_encoded = qr/[^\x20-\x7e]/;

my $e_utf16 = find_encoding("UTF-16BE");

sub needs_lines { 1 };

# encode($obj, $str, $chk)
#
# Convert the character string $str to its modified UTF-7 (IMAP mailbox
# name) form and return the result.
#
#   $obj - the encoding object or class name; not used by the conversion.
#   $str - character string to encode.
#   $chk - when true, the caller's copy of $str is set to '' once the
#          whole input has been consumed.
#
# Runs of printable ASCII are copied unchanged, every "&" becomes "&-",
# and every run of other characters is emitted as "&" + base64 of its
# UTF-16BE form (padding removed, "/" replaced by ",") + "-".
sub encode($$;$) {
    my ($obj, $str, $chk) = @_;
    my $len = length($str);
    pos($str) = 0;
    my $bytes = '';

    while (pos($str) < $len) {
        if ($str =~ /\G($re_asis+)/gc) {
            $bytes .= $1;
        }
        elsif ($str =~ /\G&/gc) {
            # Handled one at a time so that an ampersand adjacent to
            # non-ASCII text is never swallowed into a base64 run.
            $bytes .= "&-";
        }
        elsif ($str =~ /\G($re_encoded+)/gsc) {
            # Copy the capture before handing it to the encoder instead
            # of passing the magical $1 directly.
            my $chunk  = $1;
            my $base64 = encode_base64($e_utf16->encode($chunk), '');
            $base64 =~ s/=+\z//;    # modified base64 carries no padding
            $base64 =~ tr{/}{,};    # "/" is a hierarchy delimiter in IMAP
            $bytes .= "&$base64-";
        }
        else {
            die "This should not happen! (pos=" . pos($str) . ")";
        }
    }

    $_[1] = '' if $chk;
    return $bytes;
}

# decode($obj, $bytes, $chk)
#
# Convert the modified UTF-7 string $bytes back to a character string and
# return it.
#
#   $obj   - the encoding object or class name; not used by the conversion.
#   $bytes - modified UTF-7 input.
#   $chk   - when true, the caller's copy of $bytes is set to '' once the
#            whole input has been consumed.
#
# A "&" that starts neither "&-" nor a base64 run is copied through as a
# literal "&" (with a warning when $^W is set) instead of aborting.
sub decode {
    my ($obj, $bytes, $chk) = @_;
    my $len = length($bytes);
    my $str = "";
    pos($bytes) = 0;

    while (pos($bytes) < $len) {
        if ($bytes =~ /\G([^&]+)/gc) {
            $str .= $1;
        }
        elsif ($bytes =~ /\G&-/gc) {
            $str .= "&";
        }
        elsif ($bytes =~ /\G&([A-Za-z0-9+,]+)-?/gsc) {
            my $base64 = $1;
            $base64 =~ tr{,}{/};
            # Restore the padding that the encoder stripped.
            my $pad = length($base64) % 4;
            $base64 .= "=" x (4 - $pad) if $pad;
            $str .= $e_utf16->decode(decode_base64($base64));
        }
        elsif ($bytes =~ /\G&/gc) {
            # Stray shift character, e.g. a trailing "&" or "&!".
            $^W and warn "Bad IMAP-UTF7 data escape";
            $str .= "&";
        }
        else {
            die "This should not happen " . pos($bytes);
        }
    }

    $_[1] = '' if $chk;
    return $str;
}

1;
__END__

=head1 NAME

Encode::IMAPUTF7 - modification of UTF-7 encoding for IMAP

=head1 SYNOPSIS

  use Encode qw/encode decode/;

  print encode('IMAP-UTF-7', 'Répertoire');
  print decode('IMAP-UTF-7', 'R&AOk-pertoire');

=head1 ABSTRACT

IMAP mailbox names are encoded in a modified UTF-7 when names contain
international characters outside of the printable ASCII range. The
modified UTF-7 encoding is defined in RFC2060 (section 5.1.3).

There is another CPAN module with same purpose, Unicode::IMAPUtf7.
However, it works correctly only with strings whose encoded form does
not contain a plus sign. For example, the Cyrillic string
\x{043f}\x{0440}\x{0435}\x{0434}\x{043b}\x{043e}\x{0433} is represented
in UTF-7 as

  +BD8EQAQ1BDQEOwQ+BDM-

Note the second plus sign 4 characters before the end.
Unicode::IMAPUtf7 encodes the above string as

  +BD8EQAQ1BDQEOwQ&BDM-

which is not valid modified UTF-7 (the ampersand and the plus are
swapped). The problem is solved by the current module, which is slightly
modified Encode::Unicode::UTF7 and has nothing common with
Unicode::IMAPUtf7.

=head1 RFC2060 - section 5.1.3 - Mailbox International Naming Convention

By convention, international mailbox names are specified using a
modified version of the UTF-7 encoding described in [UTF-7]. The purpose
of these modifications is to correct the following problems with UTF-7:

   1) UTF-7 uses the "+" character for shifting; this conflicts with
      the common use of "+" in mailbox names, in particular USENET
      newsgroup names.

   2) UTF-7's encoding is BASE64 which uses the "/" character; this
      conflicts with the use of "/" as a popular hierarchy delimiter.

   3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
      the use of "\" as a popular hierarchy delimiter.

   4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
      the use of "~" in some servers as a home directory indicator.
   5) UTF-7 permits multiple alternate forms to represent the same
      string; in particular, printable US-ASCII characters can be
      represented in encoded form.

In modified UTF-7, printable US-ASCII characters except for "&"
represent themselves; that is, characters with octet values 0x20-0x25
and 0x27-0x7e. The character "&" (0x26) is represented by the two-octet
sequence "&-".

All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all Unicode
16-bit octets) are represented in modified BASE64, with a further
modification from [UTF-7] that "," is used instead of "/". Modified
BASE64 MUST NOT be used to represent any printing US-ASCII character
which can represent itself.

"&" is used to shift to modified BASE64 and "-" to shift back to
US-ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
is, a name that ends with a Unicode 16-bit octet MUST end with a "-").

For example, here is a mailbox name which mixes English, Japanese, and
Chinese text:

   ~peter/mail/&ZeVnLIqe-/&U,BTFw-

=head1 REQUESTS & BUGS

Please report any requests, suggestions or bugs via the RT bug-tracking
system at http://rt.cpan.org/ or email to
bug-Encode-IMAPUTF7@rt.cpan.org.

http://rt.cpan.org/NoAuth/Bugs.html?Dist=Encode-IMAPUTF7 is the RT queue
for Encode::IMAPUTF7. Please check to see if your bug has already been
reported.

=head1 COPYRIGHT

Copyright 2005 Sava Chankov, sava@cpan.org

This software may be freely copied and distributed under the same terms
and conditions as Perl.

=head1 SEE ALSO

perl(1), Encode.

=cut