file.c 20 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <ctype.h>
  5. #include <mach.h>
  6. /*
  7. * file - determine type of file
  8. */
  9. #define LENDIAN(p) ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24))
/*
 * State shared between filetype() and the individual classifiers.
 */
uchar buf[6001];	/* leading bytes of the current file; filetype() reads at most sizeof(buf)-1 and NUL-terminates */
short cfreq[140];	/* histogram filled by filetype(): indexed by ASCII code, or by a C* class code (Clatin and up) */
short wfreq[50];	/* dictionary hits counted by wordfreq(), indexed by word class */
int nbuf;		/* number of bytes read into buf */
Dir* mbuf;		/* dirfstat() result for the current file; freed at the start of the next filetype() call */
int fd;			/* descriptor opened by type(); used by long0() and iscint() */
char *fname;		/* name of the file being examined, set by type() */
char *slash;		/* rightmost '/' in fname, or 0; used by isp9font() to build subfont paths */
  18. enum
  19. {
  20. Cword,
  21. Fword,
  22. Aword,
  23. Alword,
  24. Lword,
  25. I1,
  26. I2,
  27. I3,
  28. Clatin = 128,
  29. Cbinary,
  30. Cnull,
  31. Ceascii,
  32. Cutf,
  33. };
  34. struct
  35. {
  36. char* word;
  37. int class;
  38. } dict[] =
  39. {
  40. "PATH", Lword,
  41. "TEXT", Aword,
  42. "adt", Alword,
  43. "aggr", Alword,
  44. "alef", Alword,
  45. "array", Lword,
  46. "block", Fword,
  47. "chan", Alword,
  48. "char", Cword,
  49. "common", Fword,
  50. "con", Lword,
  51. "data", Fword,
  52. "dimension", Fword,
  53. "double", Cword,
  54. "extern", Cword,
  55. "bio", I2,
  56. "float", Cword,
  57. "fn", Lword,
  58. "function", Fword,
  59. "h", I3,
  60. "implement", Lword,
  61. "import", Lword,
  62. "include", I1,
  63. "int", Cword,
  64. "integer", Fword,
  65. "iota", Lword,
  66. "libc", I2,
  67. "long", Cword,
  68. "module", Lword,
  69. "real", Fword,
  70. "ref", Lword,
  71. "register", Cword,
  72. "self", Lword,
  73. "short", Cword,
  74. "static", Cword,
  75. "stdio", I2,
  76. "struct", Cword,
  77. "subroutine", Fword,
  78. "u", I2,
  79. "void", Cword,
  80. };
  81. /* codes for 'mode' field in language structure */
  82. enum {
  83. Normal = 0,
  84. First, /* first entry for language spanning several ranges */
  85. Multi, /* later entries " " " ... */
  86. Shared, /* codes used in several languages */
  87. };
  88. struct
  89. {
  90. int mode; /* see enum above */
  91. int count;
  92. int low;
  93. int high;
  94. char *name;
  95. } language[] =
  96. {
  97. Normal, 0, 0x0080, 0x0080, "Extended Latin",
  98. Normal, 0, 0x0100, 0x01FF, "Extended Latin",
  99. Normal, 0, 0x0370, 0x03FF, "Greek",
  100. Normal, 0, 0x0400, 0x04FF, "Cyrillic",
  101. Normal, 0, 0x0530, 0x058F, "Armenian",
  102. Normal, 0, 0x0590, 0x05FF, "Hebrew",
  103. Normal, 0, 0x0600, 0x06FF, "Arabic",
  104. Normal, 0, 0x0900, 0x097F, "Devanagari",
  105. Normal, 0, 0x0980, 0x09FF, "Bengali",
  106. Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi",
  107. Normal, 0, 0x0A80, 0x0AFF, "Gujarati",
  108. Normal, 0, 0x0B00, 0x0B7F, "Oriya",
  109. Normal, 0, 0x0B80, 0x0BFF, "Tamil",
  110. Normal, 0, 0x0C00, 0x0C7F, "Telugu",
  111. Normal, 0, 0x0C80, 0x0CFF, "Kannada",
  112. Normal, 0, 0x0D00, 0x0D7F, "Malayalam",
  113. Normal, 0, 0x0E00, 0x0E7F, "Thai",
  114. Normal, 0, 0x0E80, 0x0EFF, "Lao",
  115. Normal, 0, 0x1000, 0x105F, "Tibetan",
  116. Normal, 0, 0x10A0, 0x10FF, "Georgian",
  117. Normal, 0, 0x3040, 0x30FF, "Japanese",
  118. Normal, 0, 0x3100, 0x312F, "Chinese",
  119. First, 0, 0x3130, 0x318F, "Korean",
  120. Multi, 0, 0x3400, 0x3D2F, "Korean",
  121. Shared, 0, 0x4e00, 0x9fff, "CJK",
  122. Normal, 0, 0, 0, 0, /* terminal entry */
  123. };
/*
 * Gross classification of the buffer, chosen by filetype() from the
 * character histogram before the individual classifiers run.
 */
enum
{
	Fascii, /* printable ascii */
	Flatin, /* latin 1*/
	Futf, /* UTF character set */
	Fbinary, /* binary */
	Feascii, /* ASCII with control chars */
	Fnull, /* NUL bytes in file */
} guess;
/*
 * Forward declarations.  The is*() routines, long0() and istring() are
 * the classifiers listed in call[]; each returns non-zero after
 * printing a verdict.
 */
void bump_utf_count(Rune);
int cistrncmp(char*, char*, int);
void filetype(int);
int getfontnum(uchar*, uchar**);
int isas(void);
int isc(void);
int iscint(void);
int isenglish(void);
int ishp(void);
int ishtml(void);
int isrfc822(void);
int ismbox(void);
int islimbo(void);
int ismung(void);
int isp9bit(void);
int isp9font(void);
int istring(void);
int long0(void);
int p9bitnum(uchar*);
int p9subfont(uchar*);
void print_utf(void);
void type(char*, int);
int utf_count(void);
void wordfreq(void);
/*
 * Classifiers, in the order filetype() tries them.  The first one to
 * return non-zero has printed the answer and ends the search; the 0
 * entry terminates the list.
 */
int (*call[])(void) =
{
	long0, /* recognizable by first 4 bytes */
	istring, /* recognizable by first string */
	ishtml, /* html keywords */
	isrfc822, /* email file */
	ismbox, /* mail box */
	iscint, /* compiler/assembler intermediate */
	islimbo, /* limbo source */
	isc, /* c & alef compiler key words */
	isas, /* assembler key words */
	ismung, /* entropy compressed/encrypted */
	isp9font, /* plan 9 font */
	isp9bit, /* plan 9 image (as from /dev/window) */
	isenglish, /* char frequency English */
	ishp, /* HP Job Control Language - Postscript */
	0
};
int mime;	/* set by the -m flag: print MIME types instead of descriptions */

/* MIME answers shared by several classifiers; both end in a newline */
#define OCTET "application/octet-stream\n"
#define PLAIN "text/plain\n"
  178. void
  179. main(int argc, char *argv[])
  180. {
  181. int i, j, maxlen;
  182. char *cp;
  183. Rune r;
  184. ARGBEGIN{
  185. case 'm':
  186. mime = 1;
  187. break;
  188. default:
  189. fprint(2, "usage: file [-m] [file...]\n");
  190. exits("usage");
  191. }ARGEND;
  192. maxlen = 0;
  193. if(mime == 0 || argc > 1){
  194. for(i = 0; i < argc; i++) {
  195. for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
  196. ;
  197. if(j > maxlen)
  198. maxlen = j;
  199. }
  200. }
  201. if (argc <= 0) {
  202. if(!mime)
  203. print ("stdin: ");
  204. filetype(0);
  205. }
  206. else {
  207. for(i = 0; i < argc; i++)
  208. type(argv[i], maxlen);
  209. }
  210. exits(0);
  211. }
  212. void
  213. type(char *file, int nlen)
  214. {
  215. Rune r;
  216. int i;
  217. char *p;
  218. if(nlen > 0){
  219. slash = 0;
  220. for (i = 0, p = file; *p; i++) {
  221. if (*p == '/') /* find rightmost slash */
  222. slash = p;
  223. p += chartorune(&r, p); /* count runes */
  224. }
  225. print("%s:%*s",file, nlen-i+1, "");
  226. }
  227. fname = file;
  228. if ((fd = open(file, OREAD)) < 0) {
  229. print("cannot open\n");
  230. return;
  231. }
  232. filetype(fd);
  233. close(fd);
  234. }
  235. void
  236. filetype(int fd)
  237. {
  238. Rune r;
  239. int i, f, n;
  240. char *p, *eob;
  241. free(mbuf);
  242. mbuf = dirfstat(fd);
  243. if(mbuf == nil){
  244. print("cannot stat: %r\n");
  245. return;
  246. }
  247. if(mbuf->mode & DMDIR) {
  248. print(mime ? "text/directory\n" : "directory\n");
  249. return;
  250. }
  251. if(mbuf->type != 'M' && mbuf->type != '|') {
  252. print(mime ? OCTET : "special file #%c/%s\n",
  253. mbuf->type, mbuf->name);
  254. return;
  255. }
  256. nbuf = read(fd, buf, sizeof(buf)-1);
  257. if(nbuf < 0) {
  258. print("cannot read\n");
  259. return;
  260. }
  261. if(nbuf == 0) {
  262. print(mime ? PLAIN : "empty file\n");
  263. return;
  264. }
  265. buf[nbuf] = 0;
  266. /*
  267. * build histogram table
  268. */
  269. memset(cfreq, 0, sizeof(cfreq));
  270. for (i = 0; language[i].name; i++)
  271. language[i].count = 0;
  272. eob = (char *)buf+nbuf;
  273. for(n = 0, p = (char *)buf; p < eob; n++) {
  274. if (!fullrune(p, eob-p) && eob-p < UTFmax)
  275. break;
  276. p += chartorune(&r, p);
  277. if (r == 0)
  278. f = Cnull;
  279. else if (r <= 0x7f) {
  280. if (!isprint(r) && !isspace(r))
  281. f = Ceascii; /* ASCII control char */
  282. else f = r;
  283. } else if (r == 0x080) {
  284. bump_utf_count(r);
  285. f = Cutf;
  286. } else if (r < 0xA0)
  287. f = Cbinary; /* Invalid Runes */
  288. else if (r <= 0xff)
  289. f = Clatin; /* Latin 1 */
  290. else {
  291. bump_utf_count(r);
  292. f = Cutf; /* UTF extension */
  293. }
  294. cfreq[f]++; /* ASCII chars peg directly */
  295. }
  296. /*
  297. * gross classify
  298. */
  299. if (cfreq[Cbinary])
  300. guess = Fbinary;
  301. else if (cfreq[Cutf])
  302. guess = Futf;
  303. else if (cfreq[Clatin])
  304. guess = Flatin;
  305. else if (cfreq[Ceascii])
  306. guess = Feascii;
  307. else if (cfreq[Cnull] == n) {
  308. print(mime ? OCTET : "first block all null bytes\n");
  309. return;
  310. }
  311. else guess = Fascii;
  312. /*
  313. * lookup dictionary words
  314. */
  315. memset(wfreq, 0, sizeof(wfreq));
  316. if(guess == Fascii || guess == Flatin || guess == Futf)
  317. wordfreq();
  318. /*
  319. * call individual classify routines
  320. */
  321. for(i=0; call[i]; i++)
  322. if((*call[i])())
  323. return;
  324. /*
  325. * if all else fails,
  326. * print out gross classification
  327. */
  328. if (nbuf < 100 && !mime)
  329. print(mime ? PLAIN : "short ");
  330. if (guess == Fascii)
  331. print(mime ? PLAIN : "Ascii\n");
  332. else if (guess == Feascii)
  333. print(mime ? PLAIN : "extended ascii\n");
  334. else if (guess == Flatin)
  335. print(mime ? PLAIN : "latin ascii\n");
  336. else if (guess == Futf && utf_count() < 4)
  337. print_utf();
  338. else print(mime ? OCTET : "binary\n");
  339. }
  340. void
  341. bump_utf_count(Rune r)
  342. {
  343. int low, high, mid;
  344. high = sizeof(language)/sizeof(language[0])-1;
  345. for (low = 0; low < high;) {
  346. mid = (low+high)/2;
  347. if (r >=language[mid].low) {
  348. if (r <= language[mid].high) {
  349. language[mid].count++;
  350. break;
  351. } else low = mid+1;
  352. } else high = mid;
  353. }
  354. }
  355. int
  356. utf_count(void)
  357. {
  358. int i, count;
  359. count = 0;
  360. for (i = 0; language[i].name; i++)
  361. if (language[i].count > 0)
  362. switch (language[i].mode) {
  363. case Normal:
  364. case First:
  365. count++;
  366. break;
  367. default:
  368. break;
  369. }
  370. return count;
  371. }
  372. int
  373. chkascii(void)
  374. {
  375. int i;
  376. for (i = 'a'; i < 'z'; i++)
  377. if (cfreq[i])
  378. return 1;
  379. for (i = 'A'; i < 'Z'; i++)
  380. if (cfreq[i])
  381. return 1;
  382. return 0;
  383. }
  384. int
  385. find_first(char *name)
  386. {
  387. int i;
  388. for (i = 0; language[i].name != 0; i++)
  389. if (language[i].mode == First
  390. && strcmp(language[i].name, name) == 0)
  391. return i;
  392. return -1;
  393. }
  394. void
  395. print_utf(void)
  396. {
  397. int i, printed, j;
  398. if(mime){
  399. print(PLAIN);
  400. return;
  401. }
  402. if (chkascii()) {
  403. printed = 1;
  404. print("Ascii");
  405. } else
  406. printed = 0;
  407. for (i = 0; language[i].name; i++)
  408. if (language[i].count) {
  409. switch(language[i].mode) {
  410. case Multi:
  411. j = find_first(language[i].name);
  412. if (j < 0)
  413. break;
  414. if (language[j].count > 0)
  415. break;
  416. /* Fall through */
  417. case Normal:
  418. case First:
  419. if (printed)
  420. print(" & ");
  421. else printed = 1;
  422. print("%s", language[i].name);
  423. break;
  424. case Shared:
  425. default:
  426. break;
  427. }
  428. }
  429. if(!printed)
  430. print("UTF");
  431. print(" text\n");
  432. }
  433. void
  434. wordfreq(void)
  435. {
  436. int low, high, mid, r;
  437. uchar *p, *p2, c;
  438. p = buf;
  439. for(;;) {
  440. while (p < buf+nbuf && !isalpha(*p))
  441. p++;
  442. if (p >= buf+nbuf)
  443. return;
  444. p2 = p;
  445. while(p < buf+nbuf && isalpha(*p))
  446. p++;
  447. c = *p;
  448. *p = 0;
  449. high = sizeof(dict)/sizeof(dict[0]);
  450. for(low = 0;low < high;) {
  451. mid = (low+high)/2;
  452. r = strcmp(dict[mid].word, (char*)p2);
  453. if(r == 0) {
  454. wfreq[dict[mid].class]++;
  455. break;
  456. }
  457. if(r < 0)
  458. low = mid+1;
  459. else
  460. high = mid;
  461. }
  462. *p++ = c;
  463. }
  464. }
/*
 * A magic-number rule: the file matches when its leading 32-bit
 * little-endian word, ANDed with mask, equals x.
 */
typedef struct Filemagic Filemagic;
struct Filemagic {
	ulong x;	/* expected value */
	ulong mask;	/* bits of the leading word that are compared */
	char *desc;	/* description printed normally; ends in newline */
	char *mime;	/* MIME type printed with -m; ends in newline */
};

/*
 * Rules tried by long0(), in order.
 * NOTE(review): the pac4 entry's value has bits set in the byte its
 * mask clears (0x61 under the 0x00 of 0xFFFF00FF) - confirm that the
 * comparison in filemagic() accounts for that.
 */
Filemagic long0tab[] = {
	0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
	0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
	0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET,
	0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
	0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
	0x04034B50, 0xFFFFFFFF, "zip archive\n", OCTET,
	070707, 0xFFFF, "cpio archive\n", OCTET,
	0x2F7, 0xFFFF, "tex dvi\n", OCTET,
};
  482. int
  483. filemagic(Filemagic *tab, int ntab, ulong x)
  484. {
  485. int i;
  486. for(i=0; i<ntab; i++)
  487. if((x&tab[i].mask) == tab[i].x){
  488. print(mime ? tab[i].mime : tab[i].desc);
  489. return 1;
  490. }
  491. return 0;
  492. }
  493. int
  494. long0(void)
  495. {
  496. Fhdr f;
  497. long x;
  498. seek(fd, 0, 0); /* reposition to start of file */
  499. if(crackhdr(fd, &f)) {
  500. print(mime ? OCTET : "%s\n", f.name);
  501. return 1;
  502. }
  503. x = LENDIAN(buf);
  504. if(filemagic(long0tab, nelem(long0tab), x))
  505. return 1;
  506. return 0;
  507. }
  508. /*
  509. * initial words to classify file
  510. */
  511. struct FILE_STRING
  512. {
  513. char *key;
  514. char *filetype;
  515. int length;
  516. char *mime;
  517. } file_string[] =
  518. {
  519. "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream",
  520. "!<arch>\n", "archive", 8, "application/octet-stream",
  521. "070707", "cpio archive - ascii header", 6, "application/octet-stream",
  522. "#!/bin/rc", "rc executable file", 9, "text/plain",
  523. "#!/bin/sh", "sh executable file", 9, "text/plain",
  524. "%!", "postscript", 2, "application/postscript",
  525. "\004%!", "postscript", 3, "application/postscript",
  526. "x T post", "troff output for post", 8, "application/troff",
  527. "x T Latin1", "troff output for Latin1", 10, "application/troff",
  528. "x T utf", "troff output for UTF", 7, "application/troff",
  529. "x T 202", "troff output for 202", 7, "application/troff",
  530. "x T aps", "troff output for aps", 7, "application/troff",
  531. "GIF", "GIF image", 3, "image/gif",
  532. "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript",
  533. "%PDF", "PDF", 4, "application/pdf",
  534. "<html>\n", "HTML file", 7, "text/html",
  535. "<HTML>\n", "HTML file", 7, "text/html",
  536. "compressed\n", "Compressed image or subfont", 11, "application/octet-stream",
  537. "\111\111\052\000", "tiff", 4, "image/tiff",
  538. "\115\115\000\052", "tiff", 4, "image/tiff",
  539. "\377\330\377\340", "jpeg", 4, "image/jpeg",
  540. "\377\330\377\341", "jpeg", 4, "image/jpeg",
  541. "\377\330\377\333", "jpeg", 4, "image/jpeg",
  542. "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream",
  543. 0,0,0,0
  544. };
  545. int
  546. istring(void)
  547. {
  548. int i;
  549. struct FILE_STRING *p;
  550. for(p = file_string; p->key; p++) {
  551. if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
  552. if(mime)
  553. print("%s\n", p->mime);
  554. else
  555. print("%s\n", p->filetype);
  556. return 1;
  557. }
  558. }
  559. if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
  560. for(i = 5; i < nbuf; i++)
  561. if(buf[i] == '\n')
  562. break;
  563. if(mime)
  564. print(OCTET);
  565. else
  566. print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
  567. return 1;
  568. }
  569. return 0;
  570. }
  571. char* html_string[] =
  572. {
  573. "title",
  574. "body",
  575. "head",
  576. "strong",
  577. "h1",
  578. "h2",
  579. "h3",
  580. "h4",
  581. "h5",
  582. "h6",
  583. "ul",
  584. "li",
  585. "dl",
  586. "br",
  587. "em",
  588. 0,
  589. };
  590. int
  591. ishtml(void)
  592. {
  593. uchar *p, *q;
  594. int i, count;
  595. /* compare strings between '<' and '>' to html table */
  596. count = 0;
  597. p = buf;
  598. for(;;) {
  599. while (p < buf+nbuf && *p != '<')
  600. p++;
  601. p++;
  602. if (p >= buf+nbuf)
  603. break;
  604. if(*p == '/')
  605. p++;
  606. q = p;
  607. while(p < buf+nbuf && *p != '>')
  608. p++;
  609. if (p >= buf+nbuf)
  610. break;
  611. for(i = 0; html_string[i]; i++) {
  612. if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
  613. if(count++ > 4) {
  614. print(mime ? "text/html\n" : "HTML file\n");
  615. return 1;
  616. }
  617. break;
  618. }
  619. }
  620. p++;
  621. }
  622. return 0;
  623. }
  624. char* rfc822_string[] =
  625. {
  626. "from:",
  627. "date:",
  628. "to:",
  629. "subject:",
  630. "received:",
  631. "reply to:",
  632. "sender:",
  633. 0,
  634. };
  635. int
  636. isrfc822(void)
  637. {
  638. char *p, *q, *r;
  639. int i, count;
  640. count = 0;
  641. p = (char*)buf;
  642. for(;;) {
  643. q = strchr(p, '\n');
  644. if(q == nil)
  645. break;
  646. *q = 0;
  647. if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
  648. count++;
  649. *q = '\n';
  650. p = q+1;
  651. continue;
  652. }
  653. *q = '\n';
  654. if(*p != '\t' && *p != ' '){
  655. r = strchr(p, ':');
  656. if(r == 0 || r > q)
  657. break;
  658. for(i = 0; rfc822_string[i]; i++) {
  659. if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
  660. count++;
  661. break;
  662. }
  663. }
  664. }
  665. p = q+1;
  666. }
  667. if(count >= 3){
  668. print(mime ? "message/rfc822\n" : "email file\n");
  669. return 1;
  670. }
  671. return 0;
  672. }
  673. int
  674. ismbox(void)
  675. {
  676. char *p, *q;
  677. p = (char*)buf;
  678. q = strchr(p, '\n');
  679. if(q == nil)
  680. return 0;
  681. *q = 0;
  682. if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
  683. print(mime ? "text/plain\n" : "mail box\n");
  684. return 1;
  685. }
  686. *q = '\n';
  687. return 0;
  688. }
  689. int
  690. iscint(void)
  691. {
  692. int type;
  693. char *name;
  694. Biobuf b;
  695. if(Binit(&b, fd, OREAD) == Beof)
  696. return 0;
  697. seek(fd, 0, 0);
  698. type = objtype(&b, &name);
  699. if(type < 0)
  700. return 0;
  701. if(mime)
  702. print(OCTET);
  703. else
  704. print("%s intermediate\n", name);
  705. return 1;
  706. }
  707. int
  708. isc(void)
  709. {
  710. int n;
  711. n = wfreq[I1];
  712. /*
  713. * includes
  714. */
  715. if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
  716. goto yes;
  717. if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
  718. goto yes;
  719. /*
  720. * declarations
  721. */
  722. if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
  723. goto yes;
  724. /*
  725. * assignments
  726. */
  727. if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
  728. goto yes;
  729. return 0;
  730. yes:
  731. if(mime){
  732. print(PLAIN);
  733. return 1;
  734. }
  735. if(wfreq[Alword] > 0)
  736. print("alef program\n");
  737. else
  738. print("c program\n");
  739. return 1;
  740. }
  741. int
  742. islimbo(void)
  743. {
  744. /*
  745. * includes
  746. */
  747. if(wfreq[Lword] < 4)
  748. return 0;
  749. print(mime ? PLAIN : "limbo program\n");
  750. return 1;
  751. }
  752. int
  753. isas(void)
  754. {
  755. /*
  756. * includes
  757. */
  758. if(wfreq[Aword] < 2)
  759. return 0;
  760. print(mime ? PLAIN : "as program\n");
  761. return 1;
  762. }
  763. /*
  764. * low entropy means encrypted
  765. */
  766. int
  767. ismung(void)
  768. {
  769. int i, bucket[8];
  770. float cs;
  771. if(nbuf < 64)
  772. return 0;
  773. memset(bucket, 0, sizeof(bucket));
  774. for(i=0; i<64; i++)
  775. bucket[(buf[i]>>5)&07] += 1;
  776. cs = 0.;
  777. for(i=0; i<8; i++)
  778. cs += (bucket[i]-8)*(bucket[i]-8);
  779. cs /= 8.;
  780. if(cs <= 24.322) {
  781. if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
  782. print(mime ? OCTET : "compressed\n");
  783. else
  784. print(mime ? OCTET : "encrypted\n");
  785. return 1;
  786. }
  787. return 0;
  788. }
  789. /*
  790. * english by punctuation and frequencies
  791. */
  792. int
  793. isenglish(void)
  794. {
  795. int vow, comm, rare, badpun, punct;
  796. char *p;
  797. if(guess != Fascii && guess != Feascii)
  798. return 0;
  799. badpun = 0;
  800. punct = 0;
  801. for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
  802. switch(*p) {
  803. case '.':
  804. case ',':
  805. case ')':
  806. case '%':
  807. case ';':
  808. case ':':
  809. case '?':
  810. punct++;
  811. if(p[1] != ' ' && p[1] != '\n')
  812. badpun++;
  813. }
  814. if(badpun*5 > punct)
  815. return 0;
  816. if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
  817. return 0;
  818. if(2*cfreq[';'] > cfreq['e'])
  819. return 0;
  820. vow = 0;
  821. for(p="AEIOU"; *p; p++) {
  822. vow += cfreq[*p];
  823. vow += cfreq[tolower(*p)];
  824. }
  825. comm = 0;
  826. for(p="ETAION"; *p; p++) {
  827. comm += cfreq[*p];
  828. comm += cfreq[tolower(*p)];
  829. }
  830. rare = 0;
  831. for(p="VJKQXZ"; *p; p++) {
  832. rare += cfreq[*p];
  833. rare += cfreq[tolower(*p)];
  834. }
  835. if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
  836. print(mime ? PLAIN : "English text\n");
  837. return 1;
  838. }
  839. return 0;
  840. }
  841. /*
  842. * pick up a number with
  843. * syntax _*[0-9]+_
  844. */
  845. #define P9BITLEN 12
  846. int
  847. p9bitnum(uchar *bp)
  848. {
  849. int n, c, len;
  850. len = P9BITLEN;
  851. while(*bp == ' ') {
  852. bp++;
  853. len--;
  854. if(len <= 0)
  855. return -1;
  856. }
  857. n = 0;
  858. while(len > 1) {
  859. c = *bp++;
  860. if(!isdigit(c))
  861. return -1;
  862. n = n*10 + c-'0';
  863. len--;
  864. }
  865. if(*bp != ' ')
  866. return -1;
  867. return n;
  868. }
  869. int
  870. depthof(char *s, int *newp)
  871. {
  872. char *es;
  873. int d;
  874. *newp = 0;
  875. es = s+12;
  876. while(s<es && *s==' ')
  877. s++;
  878. if(s == es)
  879. return -1;
  880. if('0'<=*s && *s<='9')
  881. return 1<<atoi(s);
  882. *newp = 1;
  883. d = 0;
  884. while(s<es && *s!=' '){
  885. s++; /* skip letter */
  886. d += strtoul(s, &s, 10);
  887. }
  888. switch(d){
  889. case 32:
  890. case 24:
  891. case 16:
  892. case 8:
  893. return d;
  894. }
  895. return -1;
  896. }
  897. int
  898. isp9bit(void)
  899. {
  900. int dep, lox, loy, hix, hiy, px, new;
  901. ulong t;
  902. long len;
  903. char *newlabel;
  904. newlabel = "old ";
  905. dep = depthof((char*)buf + 0*P9BITLEN, &new);
  906. if(new)
  907. newlabel = "";
  908. lox = p9bitnum(buf + 1*P9BITLEN);
  909. loy = p9bitnum(buf + 2*P9BITLEN);
  910. hix = p9bitnum(buf + 3*P9BITLEN);
  911. hiy = p9bitnum(buf + 4*P9BITLEN);
  912. if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
  913. return 0;
  914. if(dep < 8){
  915. px = 8/dep; /* pixels per byte */
  916. /* set l to number of bytes of data per scan line */
  917. if(lox >= 0)
  918. len = (hix+px-1)/px - lox/px;
  919. else{ /* make positive before divide */
  920. t = (-lox)+px-1;
  921. t = (t/px)*px;
  922. len = (t+hix+px-1)/px;
  923. }
  924. }else
  925. len = (hix-lox)*dep/8;
  926. len *= (hiy-loy); /* col length */
  927. len += 5*P9BITLEN; /* size of initial ascii */
  928. /*
  929. * for image file, length is non-zero and must match calculation above
  930. * for /dev/window and /dev/screen the length is always zero
  931. * for subfont, the subfont header should follow immediately.
  932. */
  933. if (len != 0 && mbuf->length == 0) {
  934. print("%splan 9 image\n", newlabel);
  935. return 1;
  936. }
  937. if (mbuf->length == len) {
  938. print("%splan 9 image\n", newlabel);
  939. return 1;
  940. }
  941. /* Ghostscript sometimes produces a little extra on the end */
  942. if (mbuf->length < len+P9BITLEN) {
  943. print("%splan 9 image\n", newlabel);
  944. return 1;
  945. }
  946. if (p9subfont(buf+len)) {
  947. print("%ssubfont file\n", newlabel);
  948. return 1;
  949. }
  950. return 0;
  951. }
  952. int
  953. p9subfont(uchar *p)
  954. {
  955. int n, h, a;
  956. /* if image too big, assume it's a subfont */
  957. if (p+3*P9BITLEN > buf+sizeof(buf))
  958. return 1;
  959. n = p9bitnum(p + 0*P9BITLEN); /* char count */
  960. if (n < 0)
  961. return 0;
  962. h = p9bitnum(p + 1*P9BITLEN); /* height */
  963. if (h < 0)
  964. return 0;
  965. a = p9bitnum(p + 2*P9BITLEN); /* ascent */
  966. if (a < 0)
  967. return 0;
  968. return 1;
  969. }
  970. #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
  971. int
  972. isp9font(void)
  973. {
  974. uchar *cp, *p;
  975. int i, n;
  976. char pathname[1024];
  977. cp = buf;
  978. if (!getfontnum(cp, &cp)) /* height */
  979. return 0;
  980. if (!getfontnum(cp, &cp)) /* ascent */
  981. return 0;
  982. for (i = 0; 1; i++) {
  983. if (!getfontnum(cp, &cp)) /* min */
  984. break;
  985. if (!getfontnum(cp, &cp)) /* max */
  986. return 0;
  987. while (WHITESPACE(*cp))
  988. cp++;
  989. for (p = cp; *cp && !WHITESPACE(*cp); cp++)
  990. ;
  991. /* construct a path name, if needed */
  992. n = 0;
  993. if (*p != '/' && slash) {
  994. n = slash-fname+1;
  995. if (n < sizeof(pathname))
  996. memcpy(pathname, fname, n);
  997. else n = 0;
  998. }
  999. if (n+cp-p < sizeof(pathname)) {
  1000. memcpy(pathname+n, p, cp-p);
  1001. n += cp-p;
  1002. pathname[n] = 0;
  1003. if (access(pathname, AEXIST) < 0)
  1004. return 0;
  1005. }
  1006. }
  1007. if (i) {
  1008. print("font file\n");
  1009. return 1;
  1010. }
  1011. return 0;
  1012. }
  1013. int
  1014. getfontnum(uchar *cp, uchar **rp)
  1015. {
  1016. while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
  1017. cp++;
  1018. if (*cp < '0' || *cp > '9')
  1019. return 0;
  1020. strtoul((char *)cp, (char **)rp, 0);
  1021. if (!WHITESPACE(**rp))
  1022. return 0;
  1023. return 1;
  1024. }
  1025. int
  1026. ishp(void)
  1027. {
  1028. if (strncmp("\033%-12345X", (char *)buf, 9)==0) {
  1029. print("HPJCL file\n");
  1030. return 1;
  1031. }
  1032. return 0;
  1033. }