package File::MimeInfo::Magic;
use strict;
use Carp;
use Fcntl 'SEEK_SET';
use File::BaseDir qw/xdg_data_files/;
require File::MimeInfo;
require Exporter;
BEGIN {
no strict "refs";
for (qw/extensions describe globs inodetype default/) {
*{$_} = \&{"File::MimeInfo::$_"};
}
}
our @ISA = qw(Exporter File::MimeInfo);
our @EXPORT = qw(mimetype);
our @EXPORT_OK = qw(extensions describe globs inodetype magic);
our $VERSION = '0.14';
our $DEBUG;
our (@magic_80, @magic, $max_buffer);
# @magic_80 and @magic are used to store the parse tree of magic data
# @magic_80 contains magic rules with priority 80 and higher, @magic the rest
# $max_buffer contains the maximum number of chars to be buffered from a non-seekable
# filehandle in order to do magic mimetyping
_rehash(); # initialize data
# use Data::Dumper;
# print Dumper \@magic_80, \@magic;
sub mimetype {
my $file = pop;
croak 'subroutine "mimetype" needs a filename as argument' unless defined $file;
return magic($file) || default($file) if ref $file;
return &File::MimeInfo::mimetype($file) unless -s $file and -r _;
my ($mimet, $fh);
return $mimet if $mimet = inodetype($file);
($mimet, $fh) = _magic($file, \@magic_80); # high priority rules
return $mimet if $mimet;
return $mimet if $mimet = globs($file);
($mimet, $fh) = _magic($fh, \@magic); # lower priority rules
close $fh unless ref $file;
return $mimet if $mimet;
return default($file);
}
sub magic {
my $file = pop;
croak 'subroutine "magic" needs a filename as argument' unless defined $file;
return undef unless ref($file) || -s $file;
print STDERR "> Checking all magic rules\n" if $DEBUG;
my ($mimet, $fh) = _magic($file, \@magic_80, \@magic);
close $fh unless ref $file;
return $mimet;
}
sub _magic {
my ($file, @rules) = @_;
my $fh;
unless (ref $file) {
open $fh, '<', $file || return undef;
binmode $fh;
}
else { $fh = $file }
for my $type (map @$_, @rules) {
for (2..$#$type) {
next unless _check_rule($$type[$_], $fh, 0);
close $fh unless ref $file;
return ($$type[1], $fh);
}
}
return (undef, $fh);
}
sub _check_rule {
my ($ref, $fh, $lev) = @_;
my $line;
if (ref $fh eq 'GLOB') {
seek($fh, $$ref[1][0], SEEK_SET);
read($fh, $line, $$ref[1][1]);
}
else { # allowing for IO::Something
$fh->seek($$ref[1][0], SEEK_SET);
$fh->read($line, $$ref[1][1]);
}
return undef unless $line =~ $$ref[2];
my $succes;
unless ($$ref[3]) { $succes++ }
else { # mask stuff
my $v = $2 & $$ref[3][1];
$succes++ if $v eq $$ref[3][0];
}
print STDERR '>', '>'x$lev, ' Value "', _escape_bytes($2),
'" at offset ', $$ref[1][0]+length($1),
" matches line $$ref[0]\n"
if $succes && $DEBUG;
return undef unless $succes;
if ($#$ref > 3) {
for (4..$#$ref) { # recurs
$succes = _check_rule($$ref[$_], $fh, $lev+1);
last if $succes;
}
}
print STDERR "> Failed nested rules\n" if $DEBUG && !($lev || $succes);
return $succes;
}
sub rehash {
&File::MimeInfo::rehash();
&_rehash();
}
sub _rehash {
($max_buffer, @magic_80, @magic) = (32); # clear data
my @magicfiles = @File::MimeInfo::DIRS
? ( grep {-e $_ && -r $_} map "$_/magic", @File::MimeInfo::DIRS )
: ( reverse xdg_data_files('mime/magic') );
my @done;
for my $file (@magicfiles) {
next if grep {$file eq $_} @done;
_hash_magic($file);
push @done, $file;
}
@magic = sort {$$b[0] <=> $$a[0]} @magic;
while ($magic[0][0] >= 80) {
push @magic_80, shift @magic;
}
}
sub _hash_magic {
my $file = shift;
open MAGIC, '<', $file || croak "Could not open file '$file' for reading";
binmode MAGIC;
<MAGIC> eq "MIME-Magic\x00\n"
or carp "Magic file '$file' doesn't seem to be a magic file";
my $line = 1;
while (<MAGIC>) {
$line++;
if (/^\[(\d+):(.*?)\]\n$/) {
push @magic, [$1,$2];
next;
}
s/^(\d*)>(\d+)=(.{2})//s || carp "$file line $line skipped\n" && next;
my ($i, $o, $l) = ($1, $2, unpack 'n', $3); # indent, offset, value length
while (length($_) <= $l) {
$_ .= <MAGIC>;
$line++;
}
my $v = substr $_, 0, $l, ''; # value
/^(?:&(.{$l}))?(?:~(\d+))?(?:\+(\d+))?\n$/s
|| carp "$file line $line skipped\n" && next;
my ($m, $w, $r) = ($1, $2, $3 || 0); # mask, word size, range
# the word size is given for big endian to little endian conversion
# dunno whether we need to do that in perl
my $end = $o + $l + $r;
$max_buffer = $end if $max_buffer < $end;
my $ref = $i ? _find_branch($i) : $magic[-1];
my $reg = '^'
. ( $r ? "(.{0,$r}?)" : '()' )
. ( $m ? "(.{$l})" : '('.quotemeta($v).')' ) ;
push @$ref, [
$line,
[$o, $end],
qr/$reg/sm,
$m ? [$v, $m] : 0
];
}
close MAGIC;
}
sub _find_branch {
my $i = shift;
my $ref = $magic[-1];
for (1..$i) { $ref = $$ref[-1] }
return $ref;
}
sub _escape_bytes {
my $string = shift;
if ($string =~ /[\x00-\x1F\xF7]/) {
$string = join '', map {
my $o = ord($_);
($o < 32) ? '^' . chr($o + 64) :
($o == 127) ? '^?' : $_ ;
} split '', $string;
}
return $string;
}
1;
__END__
=head1 NAME
File::MimeInfo::Magic - Determine file type with magic
=head1 SYNOPSIS
use File::MimeInfo::Magic;
my $mime_type = mimetype($file);
=head1 DESCRIPTION
This module inherits from L<File::MimeInfo>, it is transparant
to its functions but adds support for the freedesktop magic file.
=head1 EXPORT
The method C<mimetype> is exported by default. The methods C<magic>,
C<inodetype>, C<globs> and C<describe> can be exported on demand.
=head1 METHODS
See also L<File::MimeInfo> for methods that are inherited.
=over 4
=item C<mimetype($file)>
Returns a mime-type string for C<$file>, returns undef on failure.
This method bundles C<inodetype()>, C<globs()> and C<magic()>.
Magic rules with an priority of 80 and higher are checked before
C<globs()> is called, all other magic rules afterwards.
If this doesn't work the file is read and the mime-type defaults
to 'text/plain' or to 'application/octet-stream' when the first ten chars
of the file match ascii control chars (white spaces excluded).
If the file doesn't exist or isn't readable C<undef> is returned.
If C<$file> is an object reference only C<magic> and the default method
are used.
=item C<magic($file)>
Returns a mime-type string for C<$file> based on the magic rules,
returns undef on failure.
C<$file> can be an object reference, in that case it is supposed to have a
C<seek()> and a C<read()> method.
This allows you for example to determine the mimetype of data in memory
by using L<IO::Scalar>.
=item C<rehash()>
Rehash the data files. Glob and magic
information is preparsed when this method is called.
If you want to by-pass the XDG basedir system you can specify your database
directories by setting C<@File::MimeInfo::DIRS>. But normally it is better to
change the XDG basedir environment variables.
=back
=head1 SEE ALSO
L<File::MimeInfo>
=head1 BUGS
Please mail the author when you encounter any bugs.
Most likely to cause bugs is the fact that I partially used line based parsing
while the source data is binary and can contain newlines on strange places.
I tested with the 0.11 version of the database and found no problems, but I
can think of configurations that can cause problems.
=head1 AUTHOR
Jaap Karssenberg || Pardus [Larus] E<lt>pardus@cpan.orgE<gt>
Copyright (c) 2003 Jaap G Karssenberg. All rights reserved.
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
syntax highlighted by Code2HTML, v. 0.9.1