package File::Type; =head1 NAME File::Type - Determine a file's contents by looking at the name and contents =head1 SYNOPSIS use File::Type qw( get_type type_2_mime ) ; File::Type::load_magic( "type_file" ) ; File::Type::load_magic( \@type_defs ) ; my $file_type = get_type( "foo.pl" ) ; print type_2_mime( $get_type ) ; =head1 DESCRIPTION A perl module that acts a lot like the traditional Unix C command, but using regular expressions to do the job. File types are defined in a data structure that's passed in or in a file that contains such a data structure. Default file types important to Safari are defined in the module, so you don't need to C in some cases. =cut ################################################################ use strict ; use Exporter ; use vars qw( $VERSION @ISA @EXPORT_OK ) ; @ISA = qw( Exporter ) ; @EXPORT_OK = qw( add_mime_types get_type load_magic type_2_file_output type_2_mime ) ; $VERSION = 0.10 ; ################################################################ ################################################################ # # Globals # # # This global variable is set to the file name being evaluated before # any tests are run. DO NOT CHANGE IT IN A TEST!!! # my $_file_name ; # # This global variable is set if the caller supplied a (possibly empty) # string containing the guts of the file being tested. # my $_file_guts_provided ; # # The data structure currently in use # my $_current_magic ; # # The number of rounds of testing contained in $_current_magic # my $_max_test ; # # Used to cache the value of -T $_file_name # my $_op_T_result ; # # The name of the file type currently being evaluated # my $_type_name ; # # The name of the currently executing test # my $_test_name ; # # hmmm..... # my $debug = 0 ; ################################################################ ################################################################ =head2 FUNCTIONS =over =item add_magic Adds more types to the current magic database. NOT IMPLEMENTED ############################# =item add_mime_types Adds the contents of a mime types file to the current magic database. =cut sub add_mime_types { my $mime_file_name = shift ; open( MIMES, "<$mime_file_name" ) or die "$!: $mime_file_name" ; while ( ) { # # Parse lines that look like "text/html html htm" # next if /^\s*#/ ; next unless m@^\s*([\w.-/]+)\s+(\S(?:.+\S)?)\s*$@ ; my $mime_type = $1 ; my @exts = split( /\s+/, $2 ) ; # # See if any existing items have one or more of these extensions # by pretending we have a file with that extension and no contents # and seeing if test #1 returns a type name. If so, set the mime # type. # my %done ; my $type_names = {} ; map { $type_names->{$_} = 1 ; } keys %$_current_magic ; for my $ext ( @exts ) { $_file_name = "a.$ext" ; my @results = _do_test( $type_names, 1 ) ; for my $type_name ( @results ) { $_current_magic->{$type_name}->[0]->[0] = $mime_type ; $done{$ext} = 1 ; } } # # Create new file types for any mime types we didn't find. # my @remaining = grep{ ! exists $done{$_} ; } @exts ; if ( @remaining ) { $_current_magic->{$mime_type} = [ [ $mime_type, $mime_type ], \@remaining, ] ; } } close( MIMES ) or die "$!: $mime_file_name" ; #for ( sort keys %$_current_magic ) { # print( $_, " => [ [ ", join( ',', @{$_current_magic->{$_}->[0]} ), "], [", ( ref( $_current_magic->{$_}[1]) eq 'ARRAY' ? join( ',', @{$_current_magic->{$_}->[1]} ) : 'undef' ), "] ]\n" ) ; #} # Should rescan $_current_magic to get $_max_test in case it grew... } ############################# =item load_magic C takes either a file name or a reference to an array and sets up the internal data structures needed by C and C. See the source code for the module for more information on the data structure required. The types included with this module are not that comprehensive, since Safari needs to know about very few of them. Submissions of new and better recognizers are appreciated. =cut sub load_magic { my $types = shift ; if ( ! ref( $types ) ) { my $new_magic = do $types ; unless (defined( $new_magic )) { die "couldn't parse $types: $@" if $@; die "couldn't do $types: $!" unless defined $new_magic; die "couldn't run $types" unless $new_magic; } die "$types must enclose all data in '[' and ']'" unless ref( $new_magic ) eq 'ARRAY' ; $types = $new_magic ; } ; $_current_magic = $types ; $_max_test = 1 ; for $_type_name ( keys %$_current_magic ) { my $type = $_current_magic->{$_type_name} ; # # Line noise $#{$type} is the index of the last element in @{$type}. # my $_max_test_for_type = $#{$type} ; $_max_test = $_max_test_for_type if $_max_test_for_type > $_max_test ; } return 1 ; } ############################# =item get_type C does three levels of check and returns the result of the first sucessful check. C first stats the file, then looks at it's extension, then looks inside the file using regular expressions. Since perl5 regular expressions are pretty darn comprehensive, this should allow complete emulation of the magic files used by the Unix C command as well as the language identification heuristics. If a second argument is provided, it will be used as the file's contents, and the file will not be opened. The contents must be from the beginning of the file for most binary file types, and should be for most text file types. As much data as is feasible should be provided. =cut sub get_type { # # Set up globals # my $options = {} ; if ( ref( $_[ 0 ] ) eq 'HASH' ) { $options = shift ; $debug ||= $options->{debug} ? $options->{debug} : 0 ; } $_file_name = shift ; $_file_guts_provided = @_ ; $_op_T_result = undef ; if ( defined( $_file_name ) && ! $_file_guts_provided ) { # Note that we -l first here, so as to let the '_' work in the next if return 'symbolic link' if -l $_file_name && ! $options->{'follow'} ; return 'directory' if -d _ ; } my @results ; # # Declare this now and make it undefined. Later it will hold the # first few chunks of the file's guts. Making it undefined allows # -w to warn the user if a filename test fails. # local $_ ; # # Start off by applying tests for all types unless the caller only # wants a short list of types. # my $_type_names ; map{ $_type_names->{$_} = 1 } @{$options->{types}} if exists $options->{types} && @{$options->{types}}; unless ( $_type_names ) { map { $_type_names->{ $_ } = 1 ; } keys %$_current_magic ; } my @tests = ( exists $options->{tests} && @{$options->{tests}} ) ? @{$options->{tests}} : ( 1..$_max_test ) ; my %tests ; map{ $tests{$_} = 1 } @tests ; # # Scan for filename matches # @results = _do_test( $_type_names, 1 ) if defined( $_file_name ) && $tests{1} ; unless ( @results ) { # # Look inside the file # if ( $_file_guts_provided ) { $_ = shift ; } elsif ( defined( $_file_name ) ) { open( FILE, "<$_file_name" ) or die "$!: $_file_name" ; defined( read( FILE, $_, 16384 ) ) or die "$!: $_file_name" ; close( FILE ) or die "$!: $_file_name" ; } # # Scan for guts matches # for ( my $test = 2 ; $test <= $_max_test ; ++$test ) { next unless $tests{$test} ; @results = _do_test( $_type_names, $test ) ; last if @results ; } } return @results ? join( ' ', @results ) : is_text() ? 'text' : 'data' ; } ############################# sub _do_test { my ( $_type_names, $offset ) = @_ ; my @results ; my $score_level = 0 ; $_test_name = ( $offset == 1 ? 'file name test' : 'guts test ' . ( $offset - 1 ) ) ; for my $a ( keys %$_type_names ) { $_type_name = $a ; my $type = $_current_magic->{$_type_name} ; next unless $offset <= $#$type ; my $test = $type->[$offset] ; next unless defined $test ; my $score ; my $test_type = ref( $test ) ; if ( $test_type eq 'ARRAY' ) { if ( $offset == 1 ) { $score = has_extension( @$test ) ; } else { $score = match_and_score( @$test ) ; } } elsif ( $test_type eq 'Regexp' ) { if ( $offset == 1 ) { $score = matches_name( $test ) ; } elsif ( defined( $_ ) ) { $score = m/$test/m ; } } elsif ( $test_type eq 'CODE' ) { $score = &{$test}() ; } else { abort( "Invalid type for test: '$test_type'" ) } next unless defined $score ; abort( "Non-numeric score ($score) returned" ) unless $score =~ /^(?:[-+]?(?:\d+|\d*\.\d*))?$/ ; if ( $score > 0 ) { debug( "score $score" ) ; if ( $score > $score_level ) { @results = ( $_type_name ) ; $score_level = $score ; } elsif ( $score == $score_level ) { push( @results, $_type_name ) ; } } elsif ( $score < 0 ) { debug( "eliminated" ) ; delete $_type_names->{$_type_name} ; } } return @results ; } =item type_2_mime Takes the result from a get_type call and returns the corresponding mime_type. =cut sub type_2_mime { return $_current_magic->{$_[0]}->[0]->[1] ; } =item type_2_file_output Takes the result from a get_type call and returns the corresponding string that resembles the output from the Unix file command. =cut sub type_2_file_output { return $_current_magic->{$_[0]}->[0]->[0] ; } ################################################################ ################################################################ =head2 MAGIC DATA STRUCTURE The format of the magic data structure is: { 'file type' => [ # reported when a match is found [ 'long type' # Unix find-like description 'mime type', # used to translate file type to mime type ], name_test, # the test applied when only the file name is known guts_test_1, # the first test applied if the file name test fails. guts_test_2, # the second test applied if guts_test_1 fails ... ], 'another type' => [ ... ], ... } See C for a description of the testing algorithm. =cut my $default_magic = { 'c' => [ [ 'c program text', 'text/plain', ], [qw( c h )], \&must_be_text, sub{ m/^#!/ ? -1 : 0 }, # No shebang allowed sub{ match_and_score( qr{ ^(?: \#\s*(?: include\s*".*" | include\s*<.*> | if(\s+\defined)? | ifdef\s+\w+ | ifndef\s+\w+ | define\s+\w+ | undef\s+\w+ ) ) | /\*\W* | //.*$ | \b(?: struct | union )\s+\{ | \b(?: s?printf | if | while )\s*\( | (?:(?:const|register|volatile)\s+)? (?: void | int | char )\s*[\*&]?\w* }x )} ], 'c++' => [ [ 'c++ program text', 'text/plain', ], [qw( C H CC HH cc hh cpp hpp )], \&must_be_text, sub{ m/^#!/ ? -1 : 0 }, # No shebang allowed sub{ match_and_score( qr{ ^(?: \#\s*(?: include\s*".*" | include\s*<.*> | if(\s+\defined)? | ifdef\s+\w+ | ifndef\s+\w+ | define\s+\w+ | undef\s+\w+ ) ) | /\*\W* | //.*$ | \b(?: struct | union | class\s*:(?:\s*i(?:public|private|protected)[,\w]+) )\s+\{ | ^\s*(?:public|private|protected): | \b(?: s?printf | if | while )\s*\( | (?:(?:const|register|volatile)\s+)? (?: void | char | (?:long\s+)? (?: int | double | long | float ) )\s*[\*&]?\w* | \b(?:\*\s*)?this\s*(?:->)? }x )} ], 'data' => [ [ 'data', 'application/octet-stream' ], undef, \&must_be_text, ], 'diff' => [ [ 'diff output text', 'text/plain' ], undef, \&must_be_text, undef, sub { match_and_score( qr{ ^ (?:[-+]|[!<>@* =]\s) (?:.(?!^[^ -+!<>\@\*]))* }xms, )}, ], 'gif' => [ [ 'GIF image', 'image/gif' ], [qw( gif GIF )], \&must_be_data, qr/\AGIF8/, ], 'html' => [ [ 'html text', 'text/html', ], [qw( html htm )], \&must_be_text, qr/\A\s*/, sub { match_and_score( qr{ .*.* | | || |

|: |

|||||| |
|

|]+> }xmsi, )}, ], 'makefile' => [ [ 'makefile text', 'text/plain', ], qr/^(?:GNU)?Makefile|\.mak/i, ], 'perl' => [ [ 'perl program text', 'text/plain', ], [qw( pl pm pod PL )], \&must_be_text, qr/\A#!.*perl\b/, sub { match_and_score( qr{ ^(?: =(?:back|begin|end|for|head\d|item|over|pod).*?(^=cut|\Z) | \s*(?: package\s+\w+(?:::)? | use\s+strict | sub\s+\w | while\s+$\s+<>\s+$ | (?:if|unless|while|for|until)\s*$.+$\s*\{ | \#.*\n ) ) | \s*(?: \b(?: if | else | elsif | next | last | my | local | unless | qr | qq | qw | q | eq | ne | gt | lt | ge | le )\b | [\$\@\%\&]+\w+ | "[^"]*" | '[^']*' | <=> | =~ | !~ | = | != | [|&<>]+= | \.\. | \. | s/[^/]+/[^/]+/\w* | tr/[^/]+/[^/]+/\w* | m/[^/]+/\w* | [(),[\]] | ;\s*$ ) }mxs )} ], 'sh' => [ [ 'sh commands', 'text/plain', ], [qw( sh rc )], \&must_be_text, qr/\A#!.*sh\b/, sub { match_and_score( qr{ ^\s*\#.*\n | \s*(?: \b(?: echo | case | esac | if | fi | then | do | done | for | while | cat | sed | awk | find )\b | "[^"]*" | '[^']*' | \| | \& | \; | \&\& | \|\| | (?:[\./]*\w+)+ ) }msx )} ], 'text' => [ [ 'text', 'text/plain' ], undef, \&must_be_text, ], } ; ################################################################ ################################################################ =head2 Primitive tests These functions may be used in the magic data structure as complete tests or as part of other tests. The text / binary primitives only test the file state once and cache the results. =over =cut ############################# =item abort Aborts all testing by dieing with the message passed. =cut sub abort { my $message = shift ; die "$_file_name, $_type_name $_test_name: $message" ; } ############################# =item debug Prints a message if debugging is enabled. =cut sub debug { my $message = shift ; print "$_file_name, $_type_name $_test_name: $message\n" if $debug; } ############################# =item has_extension Returns 1 if the file name has an extension matching any of the arguments =cut sub has_extension { abort "No parameters passed to has_extension()" unless @_ ; my $exts = join( '|', map{ quotemeta( $_ ) } @_ ) ; return $_file_name =~ m/\.(?:$exts)$/ ; } ############################# =item is_text Returns 1 if the file is text, 0 if it is not. =cut sub is_text { abort "Parameter(s) '" . join( "', '", @_ ) . "' passed to is_text()" if @_ ; unless ( defined $_op_T_result ) { if ( $_file_guts_provided ) { # THIS NEEDS TO BE CHECKED AGAINST THE PERL SOURCE: IT'S A ROUGH # APPROXIMATION OF -T my $match_chars = length( join( '', m/([\r\n\t -\x7F]+)/g ) ) ; $_op_T_result = ( $match_chars + 10 ) / length( $_ ) > 0.95 ; } elsif ( defined( $_file_name ) ) { $_op_T_result = ( -T $_file_name ? 1 : 0 ) } } return $_op_T_result ; } ############################# =item is_data Returns 1 if the file is not text, 0 if it is. =cut sub is_data { abort "Parameter(s) '" . join( "', '", @_ ) . "' passed to is_data()" if @_ ; return is_text() ? 0 : 1 ; } ############################# =item match_and_score Returns a score based on where and how many of the words or regular expression arguments match. This is the routine used internally when a word list is used as a test. The strategy is to match as high a percentage of valid source text as is possible without matching nearly as much from other file formats. =cut sub match_and_score { abort "No parameters passed in to match_and_score" unless @_ ; abort "Undefined parameters passed in to match_and_score: " . join( ', ', map{ defined( $_ ) ? $_ : 'undef' } @_ ) if grep{ ! defined( $_ )} @_ ; my $regex = join( '|', map{ ref( $_ ) eq 'Regexp' ? $_ : quotemeta( $_ ) ; } @_ ) ; debug( $regex ) if $debug > 1 ; abort "Pattern matches the empty string" if '' =~ m/$regex/m ; my @matches = split ( m/((?:$regex)+)/m ) ; # Look at the matches my $is_odd = 1 ; my $match_length = 0 ; for ( @matches ) { $is_odd = ! $is_odd ; next unless defined $_ ; if ( ! $is_odd ) { debug( "didn't match '$_'" ) if $debug > 3 && ! /^\s*$/; next ; } $match_length += length( $_ ) ; chomp ; debug( "matched '$_'" ) if $debug > 2 ; } return length( $_ ) ? $match_length * 100 / length( $_ ) : 0 ; } ############################# =item matches_name Returns 1 if the file name matches the regular expression or strings passed in. =cut sub matches_name { abort "No parameters passed to matches_name()" unless @_ ; if ( ref( $_[0] ) ) { return $_file_name =~ m/$_[0]/ ; } else { my $names= join( '|', map{ quotemeta( $_ ) } @_ ) ; return $_file_name =~ m/\.(?:$names)$/ ; } } ############################# =item must_be_text Returns -1 if the file is not text (according to -T), 0 if it is. This is used to disqualify a type for a file without scoring the file, since 0 means 'can't tell', and -1 means it's not that type. =cut sub must_be_text { abort "Parameter(s) '" . join( "', '", @_ ) . "' passed to must_be_text()" if @_ ; return is_text() ? 0 : -1 ; } ############################# =item must_be_data Returns -1 if the file is text (according to -T), 0 otherwise. =cut sub must_be_data { abort "Parameter(s) '" . join( "', '", @_ ) . "' passed to must_be_data()" if @_ ; return is_text() ? -1 : 0 ; } ################################################################ ################################################################ load_magic $default_magic ; #$_file_name = "dlinkedlist.hpp" ; #open( A, $_file_name ) ; # #read(A,$_,16384) ; # #print &{$_current_magic->[0]->[4]}, "\n" ; # #exit ; sub test { my $options = {} ; $options = shift if ( ref( $_[ 0 ] ) eq 'HASH' ) ; @_ = @ARGV unless @_ ; map{ print $_ . ": " . file_type( $options, $_ ) . "\n" } @_ ; } =back =head1 AUTHOR Barrie Slaymaker =cut