ruby/magic.rb
SuzTiki:ruby
@
magic.rb
class Magic
def getstr(l)
val = '';
while !(l.empty?)
case l
when /^\\([0-7])([0-7])([0-7])/ then
v = ($1.to_i * 8 + $2.to_i) * 8 + $3.to_i
val << v;
when /^\\(.)/ then
val << $1.to_s;
when /^([^\\\s]+)/ then
val << $1;
when /^([\s]+)/ then break;
when /.*/ then break;
end
l = $';
end
l.gsub!(/^[\s]*/,'');
return val , l ;
end
def getnum(l)
val = 0;
case l
when /(0x[0-9a-fA-F]+)[\s]+/ then
l = $';
val = eval($1);
when /([0-9]+)[\s]+/ then
l = $';
val = eval($1);
end
l = $';
l.gsub!(/^[\s]*/,'');
return val, l;
end
def initialize(fname = "etc/magic.mime");
@magic = [];
m = File::open(fname , 'r');
if (m == nil)
else
while (l = m.gets)
next if (l =~ /^[\s]*#/);
next if (l =~ /^[\s]*$/);
case l
when /^>>([0-9]+)[\s]+/ then lvl = 2;
when /^>([0-9]+)[\s]+/ then lvl = 1;
when /^([0-9]+)[\s]+/ then lvl = 0;
when /.*/ then next;
end
off = $1.to_i;
l = $';
case l
when /^(byte|short|long)[\s]+/ then type = 0;
when /^(beshort|belong)[\s]+/ then type = 1;
when /^(leshort|lelong)[\s]+/ then type = 2;
when /^(string)[\s]+/ then type = 3;
when /.*/ then next;
end
type_str = $1;
l = $';
val = nil;
if (type == 3) # string
val , l = getstr(l);
width = val.length;
else
val , l = getnum(l);
case type_str
when /byte/ then width = 1;
when /short/ then width = 2;
when /long/ then width = 4;
end
end
name = nil;
if (l =~ /^([^\s]+)/)
name = $1;
end
@magic << [ lvl , type, off , width, val , name ] ;
#printf("%d %d %d %d %s => %s\n",lvl,off,width,type,val.to_s,name);
end
m.close();
end
end
def match_str(off,len, val)
begin
return (@buf[off,len] == val);
rescue
return false;
end
end
def match_le(off,len,val)
#printf("match_le %d-%d = %d\n",off,len,val);
v = 0;
off += len - 1;
begin
while (len != 0)
v *= 256;
v += @buf[off];
len -= 1;
off -= 1;
end
return (v == val);
rescue
return false;
end
end
def match_be(off,len,val)
v = 0;
begin
while (len != 0)
v *= 256;
v += @buf[off];
len -= 1;
off += 1;
end
return (v == val);
rescue
return false;
end
end
def file(fname)
m = File::open(fname , 'r');
return nil if (m == nil);
m.binmode;
@buf = m.read(256);
m.close();
match = [];
cur_lvl = 0;
@magic.each {|ent| # [ lvl , type, off , width, val , name ]
next if (ent[0] !=0 && ent[0] != cur_lvl);
cur_lvl = 0 if (ent[0] == 0);
result = false;
case ent[1] # 0: native 1: be 2: le 3: str
when 0 then result = match_le(ent[2],ent[3],ent[4])
when 1 then result = match_be(ent[2],ent[3],ent[4])
when 2 then result = match_le(ent[2],ent[3],ent[4])
when 3 then result = match_str(ent[2],ent[3],ent[4])
end
next if (!result);
return ent[5] if (ent[5] != nil);
cur_lvl += 1;
}
return "text/plain";
end
end
# m = Magic.new()
# print m.file("pack.o"),"\n";
# print m.file("pack.c"),"\n";
# print m.file("icons/suz.gif"),"\n";
# print m.file("magic.rb"),"\n";
# print m.file("html/index.html"),"\n";
@
mime の定義ファイル: NetBSD の /usr/share/misc/magic.mime
# $NetBSD: magic.mime,v 1.2 2000/05/14 23:28:43 soren Exp $
# Magic data for KMimeMagic (originally for file(1) command)
#
# The format is 4-5 columns:
# Column #1: byte number to begin checking from, ">" indicates continuation
# Column #2: type of data to match
# Column #3: contents of data to match
# Column #4: MIME type of result
# Column #5: MIME encoding of result (optional)
#------------------------------------------------------------------------------
# Localstuff: file(1) magic for locally observed files
# Add any locally observed files here.
#------------------------------------------------------------------------------
# end local stuff
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# Java
0 short 0xcafe
>2 short 0xbabe application/java
#------------------------------------------------------------------------------
# audio: file(1) magic for sound formats
#
# from Jan Nicolai Langfeldt <janl@ifi.uio.no>,
#
# Sun/NeXT audio data
0 string .snd
>12 belong 1 audio/basic
>12 belong 2 audio/basic
>12 belong 3 audio/basic
>12 belong 4 audio/basic
>12 belong 5 audio/basic
>12 belong 6 audio/basic
>12 belong 7 audio/basic
>12 belong 23 audio/x-adpcm
# DEC systems (e.g. DECstation 5000) use a variant of the Sun/NeXT format
# that uses little-endian encoding and has a different magic number
# (0x0064732E in little-endian encoding).
0 lelong 0x0064732E
>12 lelong 1 audio/x-dec-basic
>12 lelong 2 audio/x-dec-basic
>12 lelong 3 audio/x-dec-basic
>12 lelong 4 audio/x-dec-basic
>12 lelong 5 audio/x-dec-basic
>12 lelong 6 audio/x-dec-basic
>12 lelong 7 audio/x-dec-basic
# compressed (G.721 ADPCM)
>12 lelong 23 audio/x-dec-adpcm
# Bytes 0-3 of AIFF, AIFF-C, & 8SVX audio files are "FORM"
# AIFF audio data
8 string AIFF audio/x-aiff
# AIFF-C audio data
8 string AIFC audio/x-aiff
# IFF/8SVX audio data
8 string 8SVX audio/x-aiff
# Creative Labs AUDIO stuff
# Standard MIDI data
0 string MThd audio/unknown
#>9 byte >0 (format %d)
#>11 byte >1 using %d channels
# Creative Music (CMF) data
0 string CTMF audio/unknown
# SoundBlaster instrument data
0 string SBI audio/unknown
# Creative Labs voice data
0 string Creative\ Voice\ File audio/unknown
## is this next line right? it came this way...
#>19 byte 0x1A
#>23 byte >0 - version %d
#>22 byte >0 \b.%d
# [GRR 950115: is this also Creative Labs? Guessing that first line
# should be string instead of unknown-endian long...]
#0 long 0x4e54524b MultiTrack sound data
#0 string NTRK MultiTrack sound data
#>4 long x - version %ld
# Microsoft WAVE format (*.wav)
# [GRR 950115: probably all of the shorts and longs should be leshort/lelong]
# Microsoft RIFF
0 string RIFF audio/unknown
# - WAVE format
>8 string WAVE audio/x-wav
#
0 belong 0x2e7261fd application/x-realaudio
# MPEG Layer 3 sound files
# Modified the 11/20/97 at 15:59:04 by Christophe Prud'homme <christophe.prudhomme@asci.fr>
0 belong 0xfffb audio/x-mp3
#------------------------------------------------------------------------------
# c-lang: file(1) magic for C programs or various scripts
#
# XPM icons (Greg Roelofs, newt@uchicago.edu)
# ideally should go into "images", but entries below would tag XPM as C source
0 string /*\ XPM image/x-xpm 7bit
# this first will upset you if you're a PL/1 shop... (are there any left?)
# in which case rm it; ascmagic will catch real C programs
# C or REXX program text
#0 string /* text/x-c
# C++ program text
#0 string // text/x-c++
#------------------------------------------------------------------------------
# commands: file(1) magic for various shells and interpreters
#
#0 string :\ shell archive or commands for antique kernel text
0 string #!/bin/sh application/x-shellscript
0 string #!\ /bin/sh application/x-shellscript
0 string #!/bin/csh application/x-shellscript
0 string #!\ /bin/csh application/x-shellscript
# korn shell magic, sent by George Wu, gwu@clyde.att.com
0 string #!/bin/ksh application/x-shellscript
0 string #!\ /bin/ksh application/x-shellscript
0 string #!/bin/tcsh application/x-shellscript
0 string #!\ /bin/tcsh application/x-shellscript
0 string #!/usr/local/tcsh application/x-shellscript
0 string #!\ /usr/local/tcsh application/x-shellscript
0 string #!/usr/local/bin/tcsh application/x-shellscript
0 string #!\ /usr/local/bin/tcsh application/x-shellscript
# bash shell magic, from Peter Tobias (tobias@server.et-inf.fho-emden.de)
0 string #!/bin/bash application/x-shellscript
0 string #!\ /bin/bash application/x-shellscript
0 string #!/usr/local/bin/bash application/x-shellscript
0 string #!\ /usr/local/bin/bash application/x-shellscript
#
# zsh/ash/ae/nawk/gawk magic from cameron@cs.unsw.oz.au (Cameron Simpson)
0 string #!/usr/local/bin/zsh application/x-shellscript
0 string #!\ /usr/local/bin/zsh application/x-shellscript
0 string #!/usr/local/bin/ash application/x-shellscript
0 string #!\ /usr/local/bin/ash application/x-shellscript
#0 string #!/usr/local/bin/ae Neil Brown's ae
#0 string #!\ /usr/local/bin/ae Neil Brown's ae
0 string #!/bin/nawk application/x-nawk
0 string #!\ /bin/nawk application/x-nawk
0 string #!/usr/bin/nawk application/x-nawk
0 string #!\ /usr/bin/nawk application/x-nawk
0 string #!/usr/local/bin/nawk application/x-nawk
0 string #!\ /usr/local/bin/nawk application/x-nawk
0 string #!/bin/gawk application/x-gawk
0 string #!\ /bin/gawk application/x-gawk
0 string #!/usr/bin/gawk application/x-gawk
0 string #!\ /usr/bin/gawk application/x-gawk
0 string #!/usr/local/bin/gawk application/x-gawk
0 string #!\ /usr/local/bin/gawk application/x-gawk
#
0 string #!/bin/awk application/x-awk
0 string #!\ /bin/awk application/x-awk
0 string #!/usr/bin/awk application/x-awk
0 string #!\ /usr/bin/awk application/x-awk
0 string BEGIN application/x-awk
# For Larry Wall's perl language. The ``eval'' line recognizes an
# outrageously clever hack for USG systems.
# Keith Waclena <keith@cerberus.uchicago.edu>
0 string #!/bin/perl application/x-perl
0 string #!\ /bin/perl application/x-perl
0 string eval\ "exec\ /bin/perl application/x-perl
0 string #!/usr/bin/perl application/x-perl
0 string #!\ /usr/bin/perl application/x-perl
0 string eval\ "exec\ /usr/bin/perl application/x-perl
0 string #!/usr/local/bin/perl application/x-perl
0 string #!\ /usr/local/bin/perl application/x-perl
0 string eval\ "exec\ /usr/local/bin/perl application/x-perl
#------------------------------------------------------------------------------
# compress: file(1) magic for pure-compression formats (no archives)
#
# compress, gzip, pack, compact, huf, squeeze, crunch, freeze, yabba, whap, etc.
#
# Formats for various forms of compressed data
# Formats for "compress" proper have been moved into "compress.c",
# because it tries to uncompress it to figure out what's inside.
# standard unix compress
0 string \037\235 application/x-compress
# gzip (GNU zip, not to be confused with [Info-ZIP/PKWARE] zip archiver)
0 string \037\213 application/x-gzip
0 string PK\003\004 application/x-zip
# According to gzip.h, this is the correct byte order for packed data.
0 string \037\036 application/octet-stream
#
# This magic number is byte-order-independent.
#
0 short 017437 application/octet-stream
# XXX - why *two* entries for "compacted data", one of which is
# byte-order independent, and one of which is byte-order dependent?
#
# compacted data
0 short 0x1fff application/octet-stream
0 string \377\037 application/octet-stream
# huf output
0 short 0145405 application/octet-stream
# Squeeze and Crunch...
# These numbers were gleaned from the Unix versions of the programs to
# handle these formats. Note that I can only uncrunch, not crunch, and
# I didn't have a crunched file handy, so the crunch number is untested.
# Keith Waclena <keith@cerberus.uchicago.edu>
#0 leshort 0x76FF squeezed data (CP/M, DOS)
#0 leshort 0x76FE crunched data (CP/M, DOS)
# Freeze
#0 string \037\237 Frozen file 2.1
#0 string \037\236 Frozen file 1.0 (or gzip 0.5)
# lzh?
#0 string \037\240 LZH compressed data
257 string ustar\0 application/x-tar posix
257 string ustar\040\040\0 application/x-tar gnu
0 short 070707 application/x-cpio
0 short 0143561 application/x-cpio swapped
0 string =<ar> application/x-archive
0 string !<arch> application/x-archive
>8 string debian application/x-debian-package
#------------------------------------------------------------------------------
#
# RPM: file(1) magic for Red Hat Packages Erik Troan (ewt@redhat.com)
#
0 beshort 0xedab
>2 beshort 0xeedb application/x-rpm
0 lelong&0x8080ffff 0x0000081a application/x-arc lzw
0 lelong&0x8080ffff 0x0000091a application/x-arc squashed
0 lelong&0x8080ffff 0x0000021a application/x-arc uncompressed
0 lelong&0x8080ffff 0x0000031a application/x-arc packed
0 lelong&0x8080ffff 0x0000041a application/x-arc squeezed
0 lelong&0x8080ffff 0x0000061a application/x-arc crunched
0 leshort 0xea60 application/octet-stream x-arj
# LHARC/LHA archiver (Greg Roelofs, newt@uchicago.edu)
2 string -lh0- application/x-lharc lh0
2 string -lh1- application/x-lharc lh1
2 string -lz4- application/x-lharc lz4
2 string -lz5- application/x-lharc lz5
# [never seen any but the last; -lh4- reported in comp.compression:]
2 string -lzs- application/x-lha lzs
2 string -lh\ - application/x-lha lh
2 string -lhd- application/x-lha lhd
2 string -lh2- application/x-lha lh2
2 string -lh3- application/x-lha lh3
2 string -lh4- application/x-lha lh4
2 string -lh5- application/x-lha lh5
# Shell archives
10 string #\ This\ is\ a\ shell\ archive application/octet-stream x-shell
#------------------------------------------------------------------------------
# frame: file(1) magic for FrameMaker files
#
# This stuff came on a FrameMaker demo tape, most of which is
# copyright, but this file is "published" as witness the following:
#
0 string \<MakerFile application/x-frame
0 string \<MIFFile application/x-frame
0 string \<MakerDictionary application/x-frame
0 string \<MakerScreenFon application/x-frame
0 string \<MML application/x-frame
0 string \<Book application/x-frame
0 string \<Maker application/x-frame
#------------------------------------------------------------------------------
# html: file(1) magic for HTML (HyperText Markup Language) docs
#
# from Daniel Quinlan <quinlan@yggdrasil.com>
#
0 string \<HEAD text/html
0 string \<head text/html
0 string \<TITLE text/html
0 string \<title text/html
0 string \<html text/html
0 string \<HTML text/html
0 string \<!-- text/html
0 string \<h1 text/html
0 string \<H1 text/html
0 string \<!doctype\ HTML text/html
0 string \<!DOCTYPE\ HTML text/html
0 string \<!doctype\ html text/html
#------------------------------------------------------------------------------
# images: file(1) magic for image formats (see also "c-lang" for XPM bitmaps)
#
# originally from jef@helios.ee.lbl.gov (Jef Poskanzer),
# additions by janl@ifi.uio.no as well as others. Jan also suggested
# merging several one- and two-line files into here.
#
# XXX - byte order for GIF and TIFF fields?
# [GRR: TIFF allows both byte orders; GIF is probably little-endian]
#
# [GRR: what the hell is this doing in here?]
#0 string xbtoa btoa'd file
# PBMPLUS
# PBM file
0 string P1 image/x-portable-bitmap 7bit
# PGM file
0 string P2 image/x-portable-greymap 7bit
# PPM file
0 string P3 image/x-portable-pixmap 7bit
# PBM "rawbits" file
0 string P4 image/x-portable-bitmap
# PGM "rawbits" file
0 string P5 image/x-portable-greymap
# PPM "rawbits" file
0 string P6 image/x-portable-pixmap
# NIFF (Navy Interchange File Format, a modification of TIFF)
# [GRR: this *must* go before TIFF]
0 string IIN1 image/x-niff
# TIFF and friends
# TIFF file, big-endian
0 string MM image/tiff
# TIFF file, little-endian
0 string II image/tiff
# possible GIF replacements; none yet released!
# (Greg Roelofs, newt@uchicago.edu)
#
# GRR 950115: this was mine ("Zip GIF"):
# ZIF image (GIF+deflate alpha)
0 string GIF94z image/unknown
#
# GRR 950115: this is Jeremy Wohl's Free Graphics Format (better):
# FGF image (GIF+deflate beta)
0 string FGF95a image/unknown
#
# GRR 950115: this is Thomas Boutell's Portable Bitmap Format proposal
# (best; not yet implemented):
# PBF image (deflate compression)
0 string PBF image/unknown
# GIF
0 string GIF image/gif
# JPEG images
0 beshort 0xffd8 image/jpeg
# PC bitmaps (OS/2, Windoze BMP files) (Greg Roelofs, newt@uchicago.edu)
0 string BM image/bmp
#>14 byte 12 (OS/2 1.x format)
#>14 byte 64 (OS/2 2.x format)
#>14 byte 40 (Windows 3.x format)
#0 string IC icon
#0 string PI pointer
#0 string CI color icon
#0 string CP color pointer
#0 string BA bitmap array
#------------------------------------------------------------------------------
# lisp: file(1) magic for lisp programs
#
# various lisp types, from Daniel Quinlan (quinlan@yggdrasil.com)
0 string ;; text/plain 8bit
# Emacs 18 - this is always correct, but not very magical.
0 string \012( application/x-elc
# Emacs 19
0 string ;ELC\023\000\000\000 application/x-elc
#------------------------------------------------------------------------------
# mail.news: file(1) magic for mail and news
#
# There are tests to ascmagic.c to cope with mail and news.
0 string Relay-Version: message/rfc822 7bit
0 string #!\ rnews message/rfc822 7bit
0 string N#!\ rnews message/rfc822 7bit
0 string Forward\ to message/rfc822 7bit
0 string Pipe\ to message/rfc822 7bit
0 string Return-Path: message/rfc822 7bit
0 string Path: message/news 8bit
0 string Xref: message/news 8bit
0 string From: message/rfc822 7bit
0 string Article message/news 8bit
#------------------------------------------------------------------------------
# msword: file(1) magic for MS Word files
#
# Contributor claims:
# Reversed-engineered MS Word magic numbers
#
0 string \376\067\0\043 application/msword
0 string \320\317\021\340\241\261 application/msword
0 string \333\245-\0\0\0 application/msword
#------------------------------------------------------------------------------
# printer: file(1) magic for printer-formatted files
#
# PostScript
0 string %! application/postscript
0 string \004%! application/postscript
# Acrobat
# (due to clamen@cs.cmu.edu)
0 string %PDF- application/pdf
#------------------------------------------------------------------------------
# sc: file(1) magic for "sc" spreadsheet
#
38 string Spreadsheet application/x-sc
#------------------------------------------------------------------------------
# tex: file(1) magic for TeX files
#
# XXX - needs byte-endian stuff (big-endian and little-endian DVI?)
#
# From <conklin@talisman.kaleida.com>
# Although we may know the offset of certain text fields in TeX DVI
# and font files, we can't use them reliably because they are not
# zero terminated. [but we do anyway, christos]
0 string \367\002 application/x-dvi
#0 string \367\203 TeX generic font data
#0 string \367\131 TeX packed font data
#0 string \367\312 TeX virtual font data
#0 string This\ is\ TeX, TeX transcript text
#0 string This\ is\ METAFONT, METAFONT transcript text
# There is no way to detect TeX Font Metric (*.tfm) files without
# breaking them apart and reading the data. The following patterns
# match most *.tfm files generated by METAFONT or afm2tfm.
2 string \000\021 application/x-tex-tfm
2 string \000\022 application/x-tex-tfm
#>34 string >\0 (%s)
# Texinfo and GNU Info, from Daniel Quinlan (quinlan@yggdrasil.com)
#0 string \\input\ texinfo Texinfo source text
#0 string This\ is\ Info\ file GNU Info text
# correct TeX magic for Linux (and maybe more)
# from Peter Tobias (tobias@server.et-inf.fho-emden.de)
#
0 leshort 0x02f7 application/x-dvi
# RTF - Rich Text Format
0 string {\\rtf text/rtf
#------------------------------------------------------------------------------
# animation: file(1) magic for animation/movie formats
#
# animation formats, originally from vax@ccwf.cc.utexas.edu (VaX#n8)
# MPEG file
0 string \000\000\001\263 video/mpeg
# FLI animation format
0 leshort 0xAF11 video/fli
# FLC animation format
0 leshort 0xAF12 video/flc
# AVI
>8 string AVI\ video/avi
#
# SGI and Apple formats
#
0 string MOVI video/sgi
4 string moov video/quicktime moov
4 string mdat video/quicktime mdat
# The contributor claims:
# I couldn't find a real magic number for these, however, this
# -appears- to work. Note that it might catch other files, too,
# so BE CAREFUL!
#
# Note that title and author appear in the two 20-byte chunks
# at decimal offsets 2 and 22, respectively, but they are XOR'ed with
# 255 (hex FF)! DL format SUCKS BIG ROCKS.
#
# DL file version 1 , medium format (160x100, 4 images/screen)
0 byte 1 video/unknown
0 byte 2 video/unknown
#
# Databases
#
# GDBM magic numbers
# Will be maintained as part of the GDBM distribution in the future.
# <downsj@teeny.org>
0 belong 0x13579ace application/x-gdbm
0 lelong 0x13579ace application/x-gdbm
0 string GDBM application/x-gdbm
#
0 belong 0x061561 application/x-dbm
#
# Executables
#
0 string \177ELF
>4 byte 0
>4 byte 1
>4 byte 2
>5 byte 0
>5 byte 1
>>16 leshort 0
>>16 leshort 1 application/x-object
>>16 leshort 2 application/x-executable
>>16 leshort 3 application/x-sharedlib
>>16 leshort 4 application/x-coredump
#
# DOS
0 string MZ application/x-dosexec
#
# KDE
0 string [KDE\ Desktop\ Entry] application/x-kdelnk
0 string \#\ KDE\ Config\ File application/x-kdelnk
# xmcd database file for kscd
0 string \#\ xmcd text/xmcd
#------------------------------------------------------------------------------
# pkgadd: file(1) magic for SysV R4 PKG Datastreams
#
0 string #\ PaCkAgE\ DaTaStReAm application/x-svr4-package
@
追加しとくと良いもの
# PNG
0 string \211PNG image/png
(最終更新 Thu Mar 30 18:43:57 2006)