file.c 24 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <ctype.h>
  5. #include <mach.h>
  6. /*
  7. * file - determine type of file
  8. */
  9. #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24))
  10. uchar buf[6001];
  11. short cfreq[140];
  12. short wfreq[50];
  13. int nbuf;
  14. Dir* mbuf;
  15. int fd;
  16. char *fname;
  17. char *slash;
  18. enum
  19. {
  20. Cword,
  21. Fword,
  22. Aword,
  23. Alword,
  24. Lword,
  25. I1,
  26. I2,
  27. I3,
  28. Clatin = 128,
  29. Cbinary,
  30. Cnull,
  31. Ceascii,
  32. Cutf,
  33. };
  34. struct
  35. {
  36. char* word;
  37. int class;
  38. } dict[] =
  39. {
  40. "PATH", Lword,
  41. "TEXT", Aword,
  42. "adt", Alword,
  43. "aggr", Alword,
  44. "alef", Alword,
  45. "array", Lword,
  46. "block", Fword,
  47. "chan", Alword,
  48. "char", Cword,
  49. "common", Fword,
  50. "con", Lword,
  51. "data", Fword,
  52. "dimension", Fword,
  53. "double", Cword,
  54. "extern", Cword,
  55. "bio", I2,
  56. "float", Cword,
  57. "fn", Lword,
  58. "function", Fword,
  59. "h", I3,
  60. "implement", Lword,
  61. "import", Lword,
  62. "include", I1,
  63. "int", Cword,
  64. "integer", Fword,
  65. "iota", Lword,
  66. "libc", I2,
  67. "long", Cword,
  68. "module", Lword,
  69. "real", Fword,
  70. "ref", Lword,
  71. "register", Cword,
  72. "self", Lword,
  73. "short", Cword,
  74. "static", Cword,
  75. "stdio", I2,
  76. "struct", Cword,
  77. "subroutine", Fword,
  78. "u", I2,
  79. "void", Cword,
  80. };
  81. /* codes for 'mode' field in language structure */
  82. enum {
  83. Normal = 0,
  84. First, /* first entry for language spanning several ranges */
  85. Multi, /* later entries " " " ... */
  86. Shared, /* codes used in several languages */
  87. };
  88. struct
  89. {
  90. int mode; /* see enum above */
  91. int count;
  92. int low;
  93. int high;
  94. char *name;
  95. } language[] =
  96. {
  97. Normal, 0, 0x0080, 0x0080, "Extended Latin",
  98. Normal, 0, 0x0100, 0x01FF, "Extended Latin",
  99. Normal, 0, 0x0370, 0x03FF, "Greek",
  100. Normal, 0, 0x0400, 0x04FF, "Cyrillic",
  101. Normal, 0, 0x0530, 0x058F, "Armenian",
  102. Normal, 0, 0x0590, 0x05FF, "Hebrew",
  103. Normal, 0, 0x0600, 0x06FF, "Arabic",
  104. Normal, 0, 0x0900, 0x097F, "Devanagari",
  105. Normal, 0, 0x0980, 0x09FF, "Bengali",
  106. Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi",
  107. Normal, 0, 0x0A80, 0x0AFF, "Gujarati",
  108. Normal, 0, 0x0B00, 0x0B7F, "Oriya",
  109. Normal, 0, 0x0B80, 0x0BFF, "Tamil",
  110. Normal, 0, 0x0C00, 0x0C7F, "Telugu",
  111. Normal, 0, 0x0C80, 0x0CFF, "Kannada",
  112. Normal, 0, 0x0D00, 0x0D7F, "Malayalam",
  113. Normal, 0, 0x0E00, 0x0E7F, "Thai",
  114. Normal, 0, 0x0E80, 0x0EFF, "Lao",
  115. Normal, 0, 0x1000, 0x105F, "Tibetan",
  116. Normal, 0, 0x10A0, 0x10FF, "Georgian",
  117. Normal, 0, 0x3040, 0x30FF, "Japanese",
  118. Normal, 0, 0x3100, 0x312F, "Chinese",
  119. First, 0, 0x3130, 0x318F, "Korean",
  120. Multi, 0, 0x3400, 0x3D2F, "Korean",
  121. Shared, 0, 0x4e00, 0x9fff, "CJK",
  122. Normal, 0, 0, 0, 0, /* terminal entry */
  123. };
  124. enum
  125. {
  126. Fascii, /* printable ascii */
  127. Flatin, /* latin 1*/
  128. Futf, /* UTf character set */
  129. Fbinary, /* binary */
  130. Feascii, /* ASCII with control chars */
  131. Fnull, /* NULL in file */
  132. } guess;
  133. void bump_utf_count(Rune);
  134. int cistrncmp(char*, char*, int);
  135. void filetype(int);
  136. int getfontnum(uchar*, uchar**);
  137. int isas(void);
  138. int isc(void);
  139. int iscint(void);
  140. int isenglish(void);
  141. int ishp(void);
  142. int ishtml(void);
  143. int isrfc822(void);
  144. int ismbox(void);
  145. int islimbo(void);
  146. int ismung(void);
  147. int isp9bit(void);
  148. int isp9font(void);
  149. int isrtf(void);
  150. int ismsdos(void);
  151. int iself(void);
  152. int istring(void);
  153. int iff(void);
  154. int long0(void);
  155. int istar(void);
  156. int isface(void);
  157. int isexec(void);
  158. int p9bitnum(uchar*);
  159. int p9subfont(uchar*);
  160. void print_utf(void);
  161. void type(char*, int);
  162. int utf_count(void);
  163. void wordfreq(void);
  164. int (*call[])(void) =
  165. {
  166. long0, /* recognizable by first 4 bytes */
  167. istring, /* recognizable by first string */
  168. iself, /* ELF (foreign) executable */
  169. isexec, /* native executables */
  170. iff, /* interchange file format (strings) */
  171. isrfc822, /* email file */
  172. ismbox, /* mail box */
  173. istar, /* recognizable by tar checksum */
  174. ishtml, /* html keywords */
  175. iscint, /* compiler/assembler intermediate */
  176. islimbo, /* limbo source */
  177. isc, /* c & alef compiler key words */
  178. isas, /* assembler key words */
  179. ismung, /* entropy compressed/encrypted */
  180. isp9font, /* plan 9 font */
  181. isp9bit, /* plan 9 image (as from /dev/window) */
  182. isenglish, /* char frequency English */
  183. isrtf, /* rich text format */
  184. ismsdos, /* msdos exe (virus file attachement) */
  185. isface, /* ascii face file */
  186. 0
  187. };
  188. int mime;
  189. #define OCTET "application/octet-stream\n"
  190. #define PLAIN "text/plain\n"
  191. void
  192. main(int argc, char *argv[])
  193. {
  194. int i, j, maxlen;
  195. char *cp;
  196. Rune r;
  197. ARGBEGIN{
  198. case 'm':
  199. mime = 1;
  200. break;
  201. default:
  202. fprint(2, "usage: file [-m] [file...]\n");
  203. exits("usage");
  204. }ARGEND;
  205. maxlen = 0;
  206. if(mime == 0 || argc > 1){
  207. for(i = 0; i < argc; i++) {
  208. for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
  209. ;
  210. if(j > maxlen)
  211. maxlen = j;
  212. }
  213. }
  214. if (argc <= 0) {
  215. if(!mime)
  216. print ("stdin: ");
  217. filetype(0);
  218. }
  219. else {
  220. for(i = 0; i < argc; i++)
  221. type(argv[i], maxlen);
  222. }
  223. exits(0);
  224. }
  225. void
  226. type(char *file, int nlen)
  227. {
  228. Rune r;
  229. int i;
  230. char *p;
  231. if(nlen > 0){
  232. slash = 0;
  233. for (i = 0, p = file; *p; i++) {
  234. if (*p == '/') /* find rightmost slash */
  235. slash = p;
  236. p += chartorune(&r, p); /* count runes */
  237. }
  238. print("%s:%*s",file, nlen-i+1, "");
  239. }
  240. fname = file;
  241. if ((fd = open(file, OREAD)) < 0) {
  242. print("cannot open\n");
  243. return;
  244. }
  245. filetype(fd);
  246. close(fd);
  247. }
  248. void
  249. filetype(int fd)
  250. {
  251. Rune r;
  252. int i, f, n;
  253. char *p, *eob;
  254. free(mbuf);
  255. mbuf = dirfstat(fd);
  256. if(mbuf == nil){
  257. print("cannot stat: %r\n");
  258. return;
  259. }
  260. if(mbuf->mode & DMDIR) {
  261. print(mime ? "text/directory\n" : "directory\n");
  262. return;
  263. }
  264. if(mbuf->type != 'M' && mbuf->type != '|') {
  265. print(mime ? OCTET : "special file #%c/%s\n",
  266. mbuf->type, mbuf->name);
  267. return;
  268. }
  269. nbuf = read(fd, buf, sizeof(buf)-1);
  270. if(nbuf < 0) {
  271. print("cannot read\n");
  272. return;
  273. }
  274. if(nbuf == 0) {
  275. print(mime ? PLAIN : "empty file\n");
  276. return;
  277. }
  278. buf[nbuf] = 0;
  279. /*
  280. * build histogram table
  281. */
  282. memset(cfreq, 0, sizeof(cfreq));
  283. for (i = 0; language[i].name; i++)
  284. language[i].count = 0;
  285. eob = (char *)buf+nbuf;
  286. for(n = 0, p = (char *)buf; p < eob; n++) {
  287. if (!fullrune(p, eob-p) && eob-p < UTFmax)
  288. break;
  289. p += chartorune(&r, p);
  290. if (r == 0)
  291. f = Cnull;
  292. else if (r <= 0x7f) {
  293. if (!isprint(r) && !isspace(r))
  294. f = Ceascii; /* ASCII control char */
  295. else f = r;
  296. } else if (r == 0x080) {
  297. bump_utf_count(r);
  298. f = Cutf;
  299. } else if (r < 0xA0)
  300. f = Cbinary; /* Invalid Runes */
  301. else if (r <= 0xff)
  302. f = Clatin; /* Latin 1 */
  303. else {
  304. bump_utf_count(r);
  305. f = Cutf; /* UTF extension */
  306. }
  307. cfreq[f]++; /* ASCII chars peg directly */
  308. }
  309. /*
  310. * gross classify
  311. */
  312. if (cfreq[Cbinary])
  313. guess = Fbinary;
  314. else if (cfreq[Cutf])
  315. guess = Futf;
  316. else if (cfreq[Clatin])
  317. guess = Flatin;
  318. else if (cfreq[Ceascii])
  319. guess = Feascii;
  320. else if (cfreq[Cnull] == n) {
  321. print(mime ? OCTET : "first block all null bytes\n");
  322. return;
  323. }
  324. else guess = Fascii;
  325. /*
  326. * lookup dictionary words
  327. */
  328. memset(wfreq, 0, sizeof(wfreq));
  329. if(guess == Fascii || guess == Flatin || guess == Futf)
  330. wordfreq();
  331. /*
  332. * call individual classify routines
  333. */
  334. for(i=0; call[i]; i++)
  335. if((*call[i])())
  336. return;
  337. /*
  338. * if all else fails,
  339. * print out gross classification
  340. */
  341. if (nbuf < 100 && !mime)
  342. print(mime ? PLAIN : "short ");
  343. if (guess == Fascii)
  344. print(mime ? PLAIN : "Ascii\n");
  345. else if (guess == Feascii)
  346. print(mime ? PLAIN : "extended ascii\n");
  347. else if (guess == Flatin)
  348. print(mime ? PLAIN : "latin ascii\n");
  349. else if (guess == Futf && utf_count() < 4)
  350. print_utf();
  351. else print(mime ? OCTET : "binary\n");
  352. }
  353. void
  354. bump_utf_count(Rune r)
  355. {
  356. int low, high, mid;
  357. high = sizeof(language)/sizeof(language[0])-1;
  358. for (low = 0; low < high;) {
  359. mid = (low+high)/2;
  360. if (r >=language[mid].low) {
  361. if (r <= language[mid].high) {
  362. language[mid].count++;
  363. break;
  364. } else low = mid+1;
  365. } else high = mid;
  366. }
  367. }
  368. int
  369. utf_count(void)
  370. {
  371. int i, count;
  372. count = 0;
  373. for (i = 0; language[i].name; i++)
  374. if (language[i].count > 0)
  375. switch (language[i].mode) {
  376. case Normal:
  377. case First:
  378. count++;
  379. break;
  380. default:
  381. break;
  382. }
  383. return count;
  384. }
  385. int
  386. chkascii(void)
  387. {
  388. int i;
  389. for (i = 'a'; i < 'z'; i++)
  390. if (cfreq[i])
  391. return 1;
  392. for (i = 'A'; i < 'Z'; i++)
  393. if (cfreq[i])
  394. return 1;
  395. return 0;
  396. }
  397. int
  398. find_first(char *name)
  399. {
  400. int i;
  401. for (i = 0; language[i].name != 0; i++)
  402. if (language[i].mode == First
  403. && strcmp(language[i].name, name) == 0)
  404. return i;
  405. return -1;
  406. }
  407. void
  408. print_utf(void)
  409. {
  410. int i, printed, j;
  411. if(mime){
  412. print(PLAIN);
  413. return;
  414. }
  415. if (chkascii()) {
  416. printed = 1;
  417. print("Ascii");
  418. } else
  419. printed = 0;
  420. for (i = 0; language[i].name; i++)
  421. if (language[i].count) {
  422. switch(language[i].mode) {
  423. case Multi:
  424. j = find_first(language[i].name);
  425. if (j < 0)
  426. break;
  427. if (language[j].count > 0)
  428. break;
  429. /* Fall through */
  430. case Normal:
  431. case First:
  432. if (printed)
  433. print(" & ");
  434. else printed = 1;
  435. print("%s", language[i].name);
  436. break;
  437. case Shared:
  438. default:
  439. break;
  440. }
  441. }
  442. if(!printed)
  443. print("UTF");
  444. print(" text\n");
  445. }
  446. void
  447. wordfreq(void)
  448. {
  449. int low, high, mid, r;
  450. uchar *p, *p2, c;
  451. p = buf;
  452. for(;;) {
  453. while (p < buf+nbuf && !isalpha(*p))
  454. p++;
  455. if (p >= buf+nbuf)
  456. return;
  457. p2 = p;
  458. while(p < buf+nbuf && isalpha(*p))
  459. p++;
  460. c = *p;
  461. *p = 0;
  462. high = sizeof(dict)/sizeof(dict[0]);
  463. for(low = 0;low < high;) {
  464. mid = (low+high)/2;
  465. r = strcmp(dict[mid].word, (char*)p2);
  466. if(r == 0) {
  467. wfreq[dict[mid].class]++;
  468. break;
  469. }
  470. if(r < 0)
  471. low = mid+1;
  472. else
  473. high = mid;
  474. }
  475. *p++ = c;
  476. }
  477. }
  478. typedef struct Filemagic Filemagic;
  479. struct Filemagic {
  480. ulong x;
  481. ulong mask;
  482. char *desc;
  483. char *mime;
  484. };
  485. Filemagic long0tab[] = {
  486. 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
  487. 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
  488. 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET,
  489. 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
  490. 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
  491. 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip",
  492. 070707, 0xFFFF, "cpio archive\n", OCTET,
  493. 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi",
  494. 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg",
  495. 0xfeff0000, 0xffffffff, "utf-32be\n", "text/plain charset=utf-32be",
  496. 0xfffe, 0xffffffff, "utf-32le\n", "text/plain charset=utf-32le",
  497. 0xfeff, 0xffff, "utf-16be\n", "text/plain charset=utf-16be",
  498. 0xfffe, 0xffff, "utf-16le\n", "text/plain charset=utf-16le",
  499. };
  500. int
  501. filemagic(Filemagic *tab, int ntab, ulong x)
  502. {
  503. int i;
  504. for(i=0; i<ntab; i++)
  505. if((x&tab[i].mask) == tab[i].x){
  506. print(mime ? tab[i].mime : tab[i].desc);
  507. return 1;
  508. }
  509. return 0;
  510. }
  511. int
  512. long0(void)
  513. {
  514. long x;
  515. x = LENDIAN(buf);
  516. if(filemagic(long0tab, nelem(long0tab), x))
  517. return 1;
  518. return 0;
  519. }
  520. int
  521. isexec(void)
  522. {
  523. Fhdr f;
  524. seek(fd, 0, 0); /* reposition to start of file */
  525. if(crackhdr(fd, &f)) {
  526. print(mime ? OCTET : "%s\n", f.name);
  527. return 1;
  528. }
  529. return 0;
  530. }
  531. /* from tar.c */
  532. enum { NAMSIZ = 100, TBLOCK = 512 };
  533. union hblock
  534. {
  535. char dummy[TBLOCK];
  536. struct header
  537. {
  538. char name[NAMSIZ];
  539. char mode[8];
  540. char uid[8];
  541. char gid[8];
  542. char size[12];
  543. char mtime[12];
  544. char chksum[8];
  545. char linkflag;
  546. char linkname[NAMSIZ];
  547. /* rest are defined by POSIX's ustar format; see p1003.2b */
  548. char magic[6]; /* "ustar" */
  549. char version[2];
  550. char uname[32];
  551. char gname[32];
  552. char devmajor[8];
  553. char devminor[8];
  554. char prefix[155]; /* if non-null, path = prefix "/" name */
  555. } dbuf;
  556. };
  557. int
  558. checksum(union hblock *hp)
  559. {
  560. int i;
  561. char *cp;
  562. struct header *hdr = &hp->dbuf;
  563. for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
  564. *cp = ' ';
  565. i = 0;
  566. for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
  567. i += *cp & 0xff;
  568. return i;
  569. }
  570. int
  571. istar(void)
  572. {
  573. int chksum;
  574. char tblock[TBLOCK];
  575. union hblock *hp = (union hblock *)tblock;
  576. struct header *hdr = &hp->dbuf;
  577. seek(fd, 0, 0); /* reposition to start of file */
  578. if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
  579. return 0;
  580. chksum = strtol(hdr->chksum, 0, 8);
  581. if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
  582. if (strcmp(hdr->magic, "ustar") == 0)
  583. print(mime? "application/x-ustar\n":
  584. "posix tar archive\n");
  585. else
  586. print(mime? "application/x-tar\n": "tar archive\n");
  587. return 1;
  588. }
  589. return 0;
  590. }
  591. /*
  592. * initial words to classify file
  593. */
  594. struct FILE_STRING
  595. {
  596. char *key;
  597. char *filetype;
  598. int length;
  599. char *mime;
  600. } file_string[] =
  601. {
  602. "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream",
  603. "!<arch>\n", "archive", 8, "application/octet-stream",
  604. "070707", "cpio archive - ascii header", 6, "application/octet-stream",
  605. "#!/bin/rc", "rc executable file", 9, "text/plain",
  606. "#!/bin/sh", "sh executable file", 9, "text/plain",
  607. "%!", "postscript", 2, "application/postscript",
  608. "\004%!", "postscript", 3, "application/postscript",
  609. "x T post", "troff output for post", 8, "application/troff",
  610. "x T Latin1", "troff output for Latin1", 10, "application/troff",
  611. "x T utf", "troff output for UTF", 7, "application/troff",
  612. "x T 202", "troff output for 202", 7, "application/troff",
  613. "x T aps", "troff output for aps", 7, "application/troff",
  614. "GIF", "GIF image", 3, "image/gif",
  615. "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript",
  616. "%PDF", "PDF", 4, "application/pdf",
  617. "<html>\n", "HTML file", 7, "text/html",
  618. "<HTML>\n", "HTML file", 7, "text/html",
  619. "compressed\n", "Compressed image or subfont", 11, "application/octet-stream",
  620. "\111\111\052\000", "tiff", 4, "image/tiff",
  621. "\115\115\000\052", "tiff", 4, "image/tiff",
  622. "\377\330\377\340", "jpeg", 4, "image/jpeg",
  623. "\377\330\377\341", "jpeg", 4, "image/jpeg",
  624. "\377\330\377\333", "jpeg", 4, "image/jpeg",
  625. "BM", "bmp", 2, "image/bmp",
  626. "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream",
  627. "<MakerFile ", "FrameMaker file", 11, "application/framemaker",
  628. "\033%-12345X", "HPJCL file", 9, "application/hpjcl",
  629. "ID3", "mp3 audio with id3", 3, "audio/mpeg",
  630. "\211PNG", "PNG image", 4, "image/png",
  631. "P3\n", "ppm", 3, "image/ppm",
  632. "P6\n", "ppm", 3, "image/ppm",
  633. "/* XPM */\n", "xbm", 10, "image/xbm",
  634. 0,0,0,0
  635. };
  636. int
  637. istring(void)
  638. {
  639. int i;
  640. struct FILE_STRING *p;
  641. for(p = file_string; p->key; p++) {
  642. if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
  643. if(mime)
  644. print("%s\n", p->mime);
  645. else
  646. print("%s\n", p->filetype);
  647. return 1;
  648. }
  649. }
  650. if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
  651. for(i = 5; i < nbuf; i++)
  652. if(buf[i] == '\n')
  653. break;
  654. if(mime)
  655. print(OCTET);
  656. else
  657. print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
  658. return 1;
  659. }
  660. return 0;
  661. }
  662. int
  663. iff(void)
  664. {
  665. if (strncmp((char*)buf, "FORM", 4) == 0 &&
  666. strncmp((char*)buf+8, "AIFF", 4) == 0) {
  667. print("%s\n", mime? "audio/x-aiff": "aiff audio");
  668. return 1;
  669. }
  670. return 0;
  671. }
  672. char* html_string[] =
  673. {
  674. "title",
  675. "body",
  676. "head",
  677. "strong",
  678. "h1",
  679. "h2",
  680. "h3",
  681. "h4",
  682. "h5",
  683. "h6",
  684. "ul",
  685. "li",
  686. "dl",
  687. "br",
  688. "em",
  689. 0,
  690. };
  691. int
  692. ishtml(void)
  693. {
  694. uchar *p, *q;
  695. int i, count;
  696. /* compare strings between '<' and '>' to html table */
  697. count = 0;
  698. p = buf;
  699. for(;;) {
  700. while (p < buf+nbuf && *p != '<')
  701. p++;
  702. p++;
  703. if (p >= buf+nbuf)
  704. break;
  705. if(*p == '/')
  706. p++;
  707. q = p;
  708. while(p < buf+nbuf && *p != '>')
  709. p++;
  710. if (p >= buf+nbuf)
  711. break;
  712. for(i = 0; html_string[i]; i++) {
  713. if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
  714. if(count++ > 4) {
  715. print(mime ? "text/html\n" : "HTML file\n");
  716. return 1;
  717. }
  718. break;
  719. }
  720. }
  721. p++;
  722. }
  723. return 0;
  724. }
  725. char* rfc822_string[] =
  726. {
  727. "from:",
  728. "date:",
  729. "to:",
  730. "subject:",
  731. "received:",
  732. "reply to:",
  733. "sender:",
  734. 0,
  735. };
  736. int
  737. isrfc822(void)
  738. {
  739. char *p, *q, *r;
  740. int i, count;
  741. count = 0;
  742. p = (char*)buf;
  743. for(;;) {
  744. q = strchr(p, '\n');
  745. if(q == nil)
  746. break;
  747. *q = 0;
  748. if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
  749. count++;
  750. *q = '\n';
  751. p = q+1;
  752. continue;
  753. }
  754. *q = '\n';
  755. if(*p != '\t' && *p != ' '){
  756. r = strchr(p, ':');
  757. if(r == 0 || r > q)
  758. break;
  759. for(i = 0; rfc822_string[i]; i++) {
  760. if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
  761. count++;
  762. break;
  763. }
  764. }
  765. }
  766. p = q+1;
  767. }
  768. if(count >= 3){
  769. print(mime ? "message/rfc822\n" : "email file\n");
  770. return 1;
  771. }
  772. return 0;
  773. }
  774. int
  775. ismbox(void)
  776. {
  777. char *p, *q;
  778. p = (char*)buf;
  779. q = strchr(p, '\n');
  780. if(q == nil)
  781. return 0;
  782. *q = 0;
  783. if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
  784. print(mime ? "text/plain\n" : "mail box\n");
  785. return 1;
  786. }
  787. *q = '\n';
  788. return 0;
  789. }
  790. int
  791. iscint(void)
  792. {
  793. int type;
  794. char *name;
  795. Biobuf b;
  796. if(Binit(&b, fd, OREAD) == Beof)
  797. return 0;
  798. seek(fd, 0, 0);
  799. type = objtype(&b, &name);
  800. if(type < 0)
  801. return 0;
  802. if(mime)
  803. print(OCTET);
  804. else
  805. print("%s intermediate\n", name);
  806. return 1;
  807. }
  808. int
  809. isc(void)
  810. {
  811. int n;
  812. n = wfreq[I1];
  813. /*
  814. * includes
  815. */
  816. if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
  817. goto yes;
  818. if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
  819. goto yes;
  820. /*
  821. * declarations
  822. */
  823. if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
  824. goto yes;
  825. /*
  826. * assignments
  827. */
  828. if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
  829. goto yes;
  830. return 0;
  831. yes:
  832. if(mime){
  833. print(PLAIN);
  834. return 1;
  835. }
  836. if(wfreq[Alword] > 0)
  837. print("alef program\n");
  838. else
  839. print("c program\n");
  840. return 1;
  841. }
  842. int
  843. islimbo(void)
  844. {
  845. /*
  846. * includes
  847. */
  848. if(wfreq[Lword] < 4)
  849. return 0;
  850. print(mime ? PLAIN : "limbo program\n");
  851. return 1;
  852. }
  853. int
  854. isas(void)
  855. {
  856. /*
  857. * includes
  858. */
  859. if(wfreq[Aword] < 2)
  860. return 0;
  861. print(mime ? PLAIN : "as program\n");
  862. return 1;
  863. }
  864. /*
  865. * low entropy means encrypted
  866. */
  867. int
  868. ismung(void)
  869. {
  870. int i, bucket[8];
  871. float cs;
  872. if(nbuf < 64)
  873. return 0;
  874. memset(bucket, 0, sizeof(bucket));
  875. for(i=nbuf-64; i<nbuf; i++)
  876. bucket[(buf[i]>>5)&07] += 1;
  877. cs = 0.;
  878. for(i=0; i<8; i++)
  879. cs += (bucket[i]-8)*(bucket[i]-8);
  880. cs /= 8.;
  881. if(cs <= 24.322) {
  882. if(buf[0]==0x1f && buf[1]==0x9d)
  883. print(mime ? OCTET : "compressed\n");
  884. else
  885. if(buf[0]==0x1f && buf[1]==0x8b)
  886. print(mime ? OCTET : "gzip compressed\n");
  887. else
  888. if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h')
  889. print(mime ? OCTET : "bzip2 compressed\n");
  890. else
  891. print(mime ? OCTET : "encrypted\n");
  892. return 1;
  893. }
  894. return 0;
  895. }
  896. /*
  897. * english by punctuation and frequencies
  898. */
  899. int
  900. isenglish(void)
  901. {
  902. int vow, comm, rare, badpun, punct;
  903. char *p;
  904. if(guess != Fascii && guess != Feascii)
  905. return 0;
  906. badpun = 0;
  907. punct = 0;
  908. for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
  909. switch(*p) {
  910. case '.':
  911. case ',':
  912. case ')':
  913. case '%':
  914. case ';':
  915. case ':':
  916. case '?':
  917. punct++;
  918. if(p[1] != ' ' && p[1] != '\n')
  919. badpun++;
  920. }
  921. if(badpun*5 > punct)
  922. return 0;
  923. if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
  924. return 0;
  925. if(2*cfreq[';'] > cfreq['e'])
  926. return 0;
  927. vow = 0;
  928. for(p="AEIOU"; *p; p++) {
  929. vow += cfreq[*p];
  930. vow += cfreq[tolower(*p)];
  931. }
  932. comm = 0;
  933. for(p="ETAION"; *p; p++) {
  934. comm += cfreq[*p];
  935. comm += cfreq[tolower(*p)];
  936. }
  937. rare = 0;
  938. for(p="VJKQXZ"; *p; p++) {
  939. rare += cfreq[*p];
  940. rare += cfreq[tolower(*p)];
  941. }
  942. if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
  943. print(mime ? PLAIN : "English text\n");
  944. return 1;
  945. }
  946. return 0;
  947. }
  948. /*
  949. * pick up a number with
  950. * syntax _*[0-9]+_
  951. */
  952. #define P9BITLEN 12
  953. int
  954. p9bitnum(uchar *bp)
  955. {
  956. int n, c, len;
  957. len = P9BITLEN;
  958. while(*bp == ' ') {
  959. bp++;
  960. len--;
  961. if(len <= 0)
  962. return -1;
  963. }
  964. n = 0;
  965. while(len > 1) {
  966. c = *bp++;
  967. if(!isdigit(c))
  968. return -1;
  969. n = n*10 + c-'0';
  970. len--;
  971. }
  972. if(*bp != ' ')
  973. return -1;
  974. return n;
  975. }
  976. int
  977. depthof(char *s, int *newp)
  978. {
  979. char *es;
  980. int d;
  981. *newp = 0;
  982. es = s+12;
  983. while(s<es && *s==' ')
  984. s++;
  985. if(s == es)
  986. return -1;
  987. if('0'<=*s && *s<='9')
  988. return 1<<strtol(s, 0, 0);
  989. *newp = 1;
  990. d = 0;
  991. while(s<es && *s!=' '){
  992. s++; /* skip letter */
  993. d += strtoul(s, &s, 10);
  994. }
  995. switch(d){
  996. case 32:
  997. case 24:
  998. case 16:
  999. case 8:
  1000. return d;
  1001. }
  1002. return -1;
  1003. }
  1004. int
  1005. isp9bit(void)
  1006. {
  1007. int dep, lox, loy, hix, hiy, px, new;
  1008. ulong t;
  1009. long len;
  1010. char *newlabel;
  1011. newlabel = "old ";
  1012. dep = depthof((char*)buf + 0*P9BITLEN, &new);
  1013. if(new)
  1014. newlabel = "";
  1015. lox = p9bitnum(buf + 1*P9BITLEN);
  1016. loy = p9bitnum(buf + 2*P9BITLEN);
  1017. hix = p9bitnum(buf + 3*P9BITLEN);
  1018. hiy = p9bitnum(buf + 4*P9BITLEN);
  1019. if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
  1020. return 0;
  1021. if(dep < 8){
  1022. px = 8/dep; /* pixels per byte */
  1023. /* set l to number of bytes of data per scan line */
  1024. if(lox >= 0)
  1025. len = (hix+px-1)/px - lox/px;
  1026. else{ /* make positive before divide */
  1027. t = (-lox)+px-1;
  1028. t = (t/px)*px;
  1029. len = (t+hix+px-1)/px;
  1030. }
  1031. }else
  1032. len = (hix-lox)*dep/8;
  1033. len *= (hiy-loy); /* col length */
  1034. len += 5*P9BITLEN; /* size of initial ascii */
  1035. /*
  1036. * for image file, length is non-zero and must match calculation above
  1037. * for /dev/window and /dev/screen the length is always zero
  1038. * for subfont, the subfont header should follow immediately.
  1039. */
  1040. if (len != 0 && mbuf->length == 0) {
  1041. print("%splan 9 image\n", newlabel);
  1042. return 1;
  1043. }
  1044. if (mbuf->length == len) {
  1045. print("%splan 9 image\n", newlabel);
  1046. return 1;
  1047. }
  1048. /* Ghostscript sometimes produces a little extra on the end */
  1049. if (mbuf->length < len+P9BITLEN) {
  1050. print("%splan 9 image\n", newlabel);
  1051. return 1;
  1052. }
  1053. if (p9subfont(buf+len)) {
  1054. print("%ssubfont file\n", newlabel);
  1055. return 1;
  1056. }
  1057. return 0;
  1058. }
  1059. int
  1060. p9subfont(uchar *p)
  1061. {
  1062. int n, h, a;
  1063. /* if image too big, assume it's a subfont */
  1064. if (p+3*P9BITLEN > buf+sizeof(buf))
  1065. return 1;
  1066. n = p9bitnum(p + 0*P9BITLEN); /* char count */
  1067. if (n < 0)
  1068. return 0;
  1069. h = p9bitnum(p + 1*P9BITLEN); /* height */
  1070. if (h < 0)
  1071. return 0;
  1072. a = p9bitnum(p + 2*P9BITLEN); /* ascent */
  1073. if (a < 0)
  1074. return 0;
  1075. return 1;
  1076. }
  1077. #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
  1078. int
  1079. isp9font(void)
  1080. {
  1081. uchar *cp, *p;
  1082. int i, n;
  1083. char pathname[1024];
  1084. cp = buf;
  1085. if (!getfontnum(cp, &cp)) /* height */
  1086. return 0;
  1087. if (!getfontnum(cp, &cp)) /* ascent */
  1088. return 0;
  1089. for (i = 0;; i++) {
  1090. if (!getfontnum(cp, &cp)) /* min */
  1091. break;
  1092. if (!getfontnum(cp, &cp)) /* max */
  1093. return 0;
  1094. while (WHITESPACE(*cp))
  1095. cp++;
  1096. for (p = cp; *cp && !WHITESPACE(*cp); cp++)
  1097. ;
  1098. /* construct a path name, if needed */
  1099. n = 0;
  1100. if (*p != '/' && slash) {
  1101. n = slash-fname+1;
  1102. if (n < sizeof(pathname))
  1103. memcpy(pathname, fname, n);
  1104. else n = 0;
  1105. }
  1106. if (n+cp-p < sizeof(pathname)) {
  1107. memcpy(pathname+n, p, cp-p);
  1108. n += cp-p;
  1109. pathname[n] = 0;
  1110. if (access(pathname, AEXIST) < 0)
  1111. return 0;
  1112. }
  1113. }
  1114. if (i) {
  1115. print(mime ? "text/plain\n" : "font file\n");
  1116. return 1;
  1117. }
  1118. return 0;
  1119. }
  1120. int
  1121. getfontnum(uchar *cp, uchar **rp)
  1122. {
  1123. while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
  1124. cp++;
  1125. if (*cp < '0' || *cp > '9')
  1126. return 0;
  1127. strtoul((char *)cp, (char **)rp, 0);
  1128. if (!WHITESPACE(**rp))
  1129. return 0;
  1130. return 1;
  1131. }
  1132. int
  1133. isrtf(void)
  1134. {
  1135. if(strstr((char *)buf, "\\rtf1")){
  1136. print(mime ? "application/rtf\n" : "rich text format\n");
  1137. return 1;
  1138. }
  1139. return 0;
  1140. }
  1141. int
  1142. ismsdos(void)
  1143. {
  1144. if (buf[0] == 0x4d && buf[1] == 0x5a){
  1145. print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
  1146. return 1;
  1147. }
  1148. return 0;
  1149. }
  1150. int
  1151. iself(void)
  1152. {
  1153. static char *cpu[] = { /* NB: incomplete and arbitary list */
  1154. [1] "WE32100",
  1155. [2] "SPARC",
  1156. [3] "i386",
  1157. [4] "M68000",
  1158. [5] "M88000",
  1159. [6] "i486",
  1160. [7] "i860",
  1161. [8] "R3000",
  1162. [9] "S370",
  1163. [10] "R4000",
  1164. [15] "HP-PA",
  1165. [18] "sparc v8+",
  1166. [19] "i960",
  1167. [20] "PPC-32",
  1168. [21] "PPC-64",
  1169. [40] "ARM",
  1170. [41] "Alpha",
  1171. [43] "sparc v9",
  1172. [50] "IA-46",
  1173. [62] "AMD64",
  1174. [75] "VAX",
  1175. };
  1176. static char *type[] = {
  1177. [1] "relocatable object",
  1178. [2] "executable",
  1179. [3] "shared library",
  1180. [4] "core dump",
  1181. };
  1182. if (memcmp(buf, "\x7fELF", 4) == 0){
  1183. if (!mime){
  1184. int n = (buf[19] << 8) | buf[18];
  1185. char *p = "unknown";
  1186. char *t = "unknown";
  1187. if (n > 0 && n < nelem(cpu) && cpu[n])
  1188. p = cpu[n];
  1189. else {
  1190. /* try the other byte order */
  1191. n = (buf[18] << 8) | buf[19];
  1192. if (n > 0 && n < nelem(cpu) && cpu[n])
  1193. p = cpu[n];
  1194. }
  1195. n = buf[16];
  1196. if(n>0 && n < nelem(type) && type[n])
  1197. t = type[n];
  1198. print("%s ELF %s\n", p, t);
  1199. }
  1200. else
  1201. print("application/x-elf-executable");
  1202. return 1;
  1203. }
  1204. return 0;
  1205. }
  1206. int
  1207. isface(void)
  1208. {
  1209. int i, j, ldepth, l;
  1210. char *p;
  1211. ldepth = -1;
  1212. for(j = 0; j < 3; j++){
  1213. for(p = (char*)buf, i=0; i<3; i++){
  1214. if(p[0] != '0' || p[1] != 'x')
  1215. return 0;
  1216. if(buf[2+8] == ',')
  1217. l = 2;
  1218. else if(buf[2+4] == ',')
  1219. l = 1;
  1220. else
  1221. return 0;
  1222. if(ldepth == -1)
  1223. ldepth = l;
  1224. if(l != ldepth)
  1225. return 0;
  1226. strtoul(p, &p, 16);
  1227. if(*p++ != ',')
  1228. return 0;
  1229. while(*p == ' ' || *p == '\t')
  1230. p++;
  1231. }
  1232. if (*p++ != '\n')
  1233. return 0;
  1234. }
  1235. if(mime)
  1236. print("application/x-face\n");
  1237. else
  1238. print("face image depth %d\n", ldepth);
  1239. return 1;
  1240. }