jpn.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322
  1. /*
  2. * CDE - Common Desktop Environment
  3. *
  4. * Copyright (c) 1993-2012, The Open Group. All rights reserved.
  5. *
  6. * These libraries and programs are free software; you can
  7. * redistribute them and/or modify them under the terms of the GNU
  8. * Lesser General Public License as published by the Free Software
  9. * Foundation; either version 2 of the License, or (at your option)
  10. * any later version.
  11. *
  12. * These libraries and programs are distributed in the hope that
  13. * they will be useful, but WITHOUT ANY WARRANTY; without even the
  14. * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  15. * PURPOSE. See the GNU Lesser General Public License for more
  16. * details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with these libraries and programs; if not, write
  20. * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
  21. * Floor, Boston, MA 02110-1301 USA
  22. */
  23. /*
  24. * COMPONENT_NAME: austext
  25. *
  26. * FUNCTIONS: display_jstate
  27. * jpn_parser
  28. * kanji_compounder
  29. * load_jpn_language
  30. * load_jpntree
  31. * parse_substring
  32. * read_jchar
  33. * search_kanjitree
  34. *
  35. * ORIGINS: 27
  36. *
  37. *
  38. * (C) COPYRIGHT International Business Machines Corp. 1995,1996
  39. * All Rights Reserved
  40. * Licensed Materials - Property of IBM
  41. * US Government Users Restricted Rights - Use, duplication or
  42. * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
  43. */
  44. /******************** JPN.C ********************
  45. * $TOG: jpn.c /main/7 1999/10/14 14:11:33 mgreess $
  46. * September 1995.
  47. * Includes functions and data for parsing Japanese,
  48. * supported languages DtSrLaJPN and DtSrLaJPN2.
  49. * Currently only supports EUC packed format,
  50. * but should be easily extendable to Shift-JIS.
  51. * JIS can be supported if half-width katakana are excluded
  52. * (no SI or SO chars to conflict with the ^O stemming char,
  53. * and engine must decide never to balk at ESCape sequences).
  54. * Will not support Unicode or other fixed width, n-wide
  55. * encodings that would conflict with ascii in either byte.
  56. * Does not require wide char or multibyte char functions.
  57. * There is no Japanese stemmer(), ie standard null_stemmer() is used.
  58. *
  59. * Code Set 0 can be either 7-bit ASCII or 7-bit JIS-Roman.
  60. * The parser() for ASCII is the full teskey_parser()
  61. * used for European languages with an ascii char set.
  62. * Min/max word size, stoplists, and include lists may be
  63. * used if provided, as in European languages.
  64. *
  65. * Code Set 1 is JIS X 0208-1990.
  66. * Symbols and line drawing elements are not indexed.
  67. * Hiragana strings are discarded as equivalent to stoplist words.
  68. * Contiguous strings of katakana, Roman, Greek, or cyrillic
  69. * are parsed as single words.
  70. *
  71. * Individual kanji chars are parsed as single words.
  72. * In addition, for language DtSrLaJPN, all kanji compounds
  73. * (pairs, triplets, etc) found in any contiguous string of
  74. * kanjis will be parsed up to a maximum word size
  75. * defined in MAX_KANJI_CLEN (see caveat below).
  76. * For language DtSrLaJPN2, only kanji substrings listed
  77. * in a .knj file are parsed as additional compound words.
  78. * Characters from unassigned kuten rows are presumed to be
  79. * user-defined kanji and are parsed as such.
  80. *
  81. * Code Set 2 is 1/2 width katakana.
  82. * Contiguous strings are parsed as single words.
  83. *
  84. * Code Set 3 is JIS X 0212-1990.
  85. * Parsing is similar to Code Set 1: discard symbols, etc,
  86. * contiguous strings of related foreign characters equal words,
  87. * and individual kanji and unassigned chars equal single words,
  88. * with additional kanji compounding depending on language.
  89. * Row 5 has 4 new katakana (not yet officially approved)
  90. * so it is treated here as katakana.
  91. *
  92. * $Log$
  93. * Revision 2.8 1996/04/10 20:24:33 miker
  94. * Fixed bug in kanji tree loader.
  95. *
  96. * Revision 2.7 1996/03/25 18:55:15 miker
  97. * Changed FILENAME_MAX to _POSIX_PATH_MAX.
  98. *
  99. * Revision 2.6 1996/03/13 22:57:40 miker
  100. * Added prolog. Changed char to UCHAR several places.
  101. *
  102. * Revision 2.5 1996/03/05 16:09:58 miker
  103. * Made jchar array of unsigned chars for compat with Sun compilers.
  104. * Added test of PA_MSGS for yacc-based boolean queries.
  105. *
  106. * Revision 2.4 1996/02/01 19:08:10 miker
  107. * AusText 2.1.11, DtSearch 0.3: Major rewrite for new parsers.
  108. * Made optional power series kanji compounding (KANJI_COMPOUNDS)
  109. * into a new DtSrLaJPN language. Old version now DtSrLaJPN2.
  110. *
  111. * Revision 2.3 1995/12/01 16:20:17 miker
  112. * Changed read_jchar arg to unsigned to fix Solaris bug.
  113. *
  114. * Revision 2.2 1995/10/26 15:08:31 miker
  115. * Added prolog.
  116. *
  117. * Revision 2.1 1995/09/22 20:57:13 miker
  118. * Freeze DtSearch 0.1, AusText 2.1.8
  119. *
  120. * Revision 1.1 1995/09/19 21:24:57 miker
  121. * Initial revision
  122. */
  123. #include "SearchP.h"
  124. #include <limits.h>
  125. #include <stdlib.h>
  126. #include <stdio.h>
  127. #include <string.h>
  128. #include <errno.h>
  129. #include <sys/stat.h>
  /* Module tag.  It is string-pasted in front of numeric ids
   * when building message/location codes, e.g. PROGNAME"807".
   */
  #define PROGNAME        "JPN"

  /* EUC single shift lead bytes. */
  #define SS2_CHAR        0x8E    /* next char belongs to Code Set 2 */
  #define SS3_CHAR        0x8F    /* next char belongs to Code Set 3 */

  /* File name extensions for the language word files.
   * The .knj file lists kanji compounds; the .ktk file is
   * presumably its katakana counterpart (loader not shown
   * in this part of the file -- TODO confirm).
   */
  #define EXT_KATAKANA    ".ktk"
  #define EXT_KANJI       ".knj"

  /* Base allocation size of substrbuf (a few slack bytes are added). */
  #define SUBSTRBUFSZ     100

  /* Message catalog set numbers handed to CATGETS().
   * NOTE(review): MS_misc has no user in the code seen here.
   */
  #define MS_misc         1
  #define MS_lang         15
  /* Longest kanji compound, counted in kanji chars, that language
   * DtSrLaJPN will generate.
   *
   * Besides returning every kanji as a word of its own, DtSrLaJPN
   * blindly returns every contiguous kanji substring of up to
   * MAX_KANJI_CLEN chars as a separate compound word.
   * For example if MAX_KANJI_CLEN were 3, the 4 kanjis "ABCD"
   * would parse as "A B C D AB BC CD ABC BCD".
   * A run of n kanjis therefore yields n + (n-1) + (n-2) + ... words,
   * one term for each compound length from 1 up to MAX_KANJI_CLEN.
   * This can be very wasteful of indexing time and file space.
   *
   * The alternative is language DtSrLaJPN2, which only considers
   * strings listed in jpn.knj as valid kanji compounds.
   * The kanji compounds in jpn.knj are the statistically significant
   * kanji substrings found in a large corpus of natural language Japanese.
   */
  #define MAX_KANJI_CLEN  6
  152. /************************************************/
  153. /* */
  154. /* JSTATE */
  155. /* */
  156. /************************************************/
  157. /* EUC text substring types.
  158. * Used to switch states in parser's automaton.
  159. * Coded as bit positions for efficient boolean comparisons.
  160. */
  /* One flag per substring type.  Each type occupies its own bit so
   * that a set of types can be tested with a single mask
   * (see JS_ALPHA_COMPATIBLE below).
   */
  #define JS_STX          (1 << 0)    /* 0x0001  Start of text blk, initial state */
  #define JS_KANJI        (1 << 1)    /* 0x0002  Set 1, Set 3 */
  #define JS_KATAKANA     (1 << 2)    /* 0x0004  Set 1 */
  #define JS_ASCII        (1 << 3)    /* 0x0008  Set 0 */
  #define JS_ROMAN        (1 << 4)    /* 0x0010  Set 1 */
  #define JS_GREEK        (1 << 5)    /* 0x0020  Set 1, Set 3 */
  #define JS_CYRILLIC     (1 << 6)    /* 0x0040  Set 1 */
  #define JS_ALPHA        (1 << 7)    /* 0x0080  Set 3 */
  #define JS_HALFKATA     (1 << 8)    /* 0x0100  Set 2 */
  #define JS_DISCARD      (1 << 9)    /* 0x0200  Set 1, Set 3, any char not in EUC */

  /* NOTE(review): JS_ETX is not a single bit; numerically it equals
   * (JS_HALFKATA | JS_DISCARD).  That looks unintended, but the value
   * is kept as is -- confirm that nothing mask-tests it before
   * changing it.
   */
  #define JS_ETX          0x0300      /* End of text block */

  /* States that a run of Set 3 JS_ALPHA chars may merge with. */
  #define JS_ALPHA_COMPATIBLE     (JS_ROMAN | JS_GREEK | JS_CYRILLIC)
  173. /************************************************/
  174. /* */
  175. /* JPNTREE */
  176. /* */
  177. /************************************************/
  178. /* Similar to standard binary WORDTREE.
  179. * Each tree node distinguished by first 4 bytes
  180. * (usually 2 jchars), which is minimum compound word size.
  181. * All compounds beginning with those 4 bytes are chained
  182. * in a linked list off of that node.
  183. */
  /* One node of a compound word tree.
   * rlink/llink make up the binary tree proper; next chains the
   * further compounds that hang off the same tree node.
   */
  struct _jpntree_tag {
      struct _jpntree_tag  *rlink;    /* right child in binary tree */
      struct _jpntree_tag  *llink;    /* left child in binary tree */
      struct _jpntree_tag  *next;     /* next compound in this node's list */
      int                  len;       /* byte length of word */
      void                 *word;     /* the compound itself */
  };
  typedef struct _jpntree_tag JPNTREE;
  /************************************************/
  /* */
  /* JPNBLK */
  /* */
  /************************************************/
  /* Pair of word trees kept per database; jpn_parser() picks it
   * up through the DBLK's parse_extra pointer.
   */
  typedef struct {
      JPNTREE  *katatree;     /* katakana word tree */
      JPNTREE  *kanjitree;    /* kanji compounds tree */
  } JPNBLK;
  200. /************************************************/
  201. /* */
  202. /* GLOBALS */
  203. /* */
  204. /************************************************/
  205. int debugging_jpn = FALSE;
  206. extern int debugging_loadlang;
  207. extern int debugging_loadword;
  208. /* Used in jpn_parser() and parse_substr(). Made global for speed. */
  209. static int do_compounding = FALSE;
  210. static int is_new_substring = TRUE;
  211. static int jstate, last_jstate;
  212. static UCHAR jchar [8];
  213. static int jcharlen = 0;
  214. static DBLK *jpn_dblk;
  215. static JPNTREE *jpn_kanjitree = NULL;
  216. static JPNTREE *jpn_katatree = NULL;
  217. static JPNTREE *kanjitree = NULL;
  218. static int language;
  219. static long *offsetp;
  220. static long readcount = 0;
  221. static READCFP readchar;
  222. static void *readchar_arg;
  223. static UCHAR *outbuf = NULL;
  224. static UCHAR *save_parg_string = NULL;
  225. static UCHAR *substrbuf = NULL;
  226. static long substr_offset;
  227. char *ensure_end_slash (char *pathstr);
  228. /************************************************/
  229. /* */
  230. /* display_jstate */
  231. /* */
  232. /************************************************/
  233. /* for debugging and error msgs */
  234. static char *display_jstate (int js)
  235. {
  236. switch (js) {
  237. case JS_KANJI: return "KANJI";
  238. case JS_KATAKANA: return "KATAKANA";
  239. case JS_DISCARD: return "DISCARD";
  240. case JS_ROMAN: return "ROMAN";
  241. case JS_ASCII: return "ASCII";
  242. case JS_ALPHA: return "ALPHA";
  243. case JS_ETX: return "ETX";
  244. case JS_STX: return "STX";
  245. case JS_GREEK: return "GREEK";
  246. case JS_CYRILLIC: return "CYRILLIC";
  247. case JS_HALFKATA: return "HALFKATA";
  248. default: return "(UNKNOWN)";
  249. }
  250. } /* display_jstate() */
  251. /************************************************/
  252. /* */
  253. /* read_jchar */
  254. /* */
  255. /************************************************/
  256. /* Subroutine of jpn_parser().
  257. * Using global character reading 'readchar' cofunction,
  258. * returns (1) next multibyte Japanese character in global jchar,
  259. * (2) length of jchar in global jcharlen, and
  260. * (3) next state of state machine in global jstate.
  261. * Function itself returns jstate.
  262. * Rows in the KUTEN tables which are officially 'unassigned'
  263. * are treated as user-defined kanji, so all jstates
  264. * are presumed JS_KANJI except those specifically marked
  265. * otherwise at the beginning of each array below.
  266. */
  267. static int read_jchar (void)
  268. {
  269. /* Jstates table for EUC Set 1 (JIS 0208) */
  270. static int jstates_set1 [] = {
  271. JS_DISCARD, JS_DISCARD, JS_DISCARD, /* 0 - 2 */
  272. JS_ROMAN, JS_DISCARD, JS_KATAKANA, /* 3 - 5 */
  273. JS_GREEK, JS_CYRILLIC, JS_DISCARD /* 6 - 8 */
  274. };
  275. /* Jstates table for EUC Set 3 (JIS 0212).
  276. * Row 5 is presumed to be katakana because
  277. * of four new unapproved katakana characters.
  278. */
  279. static int jstates_set3 [] = {
  280. JS_DISCARD, JS_DISCARD, JS_DISCARD, /* 0 - 2 */
  281. JS_DISCARD, JS_DISCARD, JS_KATAKANA, /* 3 - 5 */
  282. JS_GREEK, JS_CYRILLIC, JS_DISCARD, /* 6 - 8 */
  283. JS_ALPHA, JS_ALPHA, JS_ALPHA /* 9 - 11 */
  284. };
  285. if (readchar_arg) {
  286. jchar[0] = readchar (readchar_arg);
  287. readchar_arg = NULL;
  288. }
  289. else
  290. jchar[0] = readchar (NULL);
  291. if (jchar[0] == 0)
  292. return (jstate = JS_ETX);
  293. readcount++;
  294. /* Set 1 (JIS 0208) */
  295. if (jchar[0] >= 0xA1 && jchar[0] <= 0xFE) {
  296. jcharlen = 2;
  297. if (jchar[0] > 0xA8)
  298. jstate = JS_KANJI;
  299. else
  300. jstate = jstates_set1 [(jchar[0] & 0x7F) - 32];
  301. if ((jchar[1] = readchar (NULL)))
  302. readcount++;
  303. else
  304. jstate = JS_ETX;
  305. return jstate;
  306. }
  307. /* Set 0 (ASCII) */
  308. if (jchar[0] < 0x80) {
  309. jcharlen = 1;
  310. return (jstate = JS_ASCII);
  311. }
  312. /* Set 3 (JIS 0212) */
  313. if (jchar[0] == SS3_CHAR) {
  314. jcharlen = 3;
  315. /*
  316. * Hop over the single shift char to get the first JIS byte.
  317. * Make sure first JIS byte is in proper
  318. * range to avoid indexing outside of table.
  319. */
  320. if ((jchar[1] = readchar (NULL)) == 0)
  321. return (jstate = JS_ETX);
  322. readcount++;
  323. if (jchar[1] < 0xA1)
  324. return (jstate = JS_DISCARD);
  325. if (jchar[1] > 0xAA)
  326. jstate = JS_KANJI;
  327. else
  328. jstate = jstates_set3 [(*jchar & 0x7F) - 32];
  329. if ((jchar[2] = readchar (NULL)) == 0)
  330. return (jstate = JS_ETX);
  331. readcount++;
  332. /* JS_ALPHA chars ('miscellaneous alphabetic chars' of
  333. * rows 9 - 11) are compatible with several other jstates,
  334. * so adjust as necessary.
  335. */
  336. if (jstate == JS_ALPHA &&
  337. ((last_jstate & JS_ALPHA_COMPATIBLE) != 0))
  338. jstate = last_jstate;
  339. else if (last_jstate == JS_ALPHA &&
  340. ((jstate & JS_ALPHA_COMPATIBLE) != 0))
  341. last_jstate = jstate;
  342. return jstate;
  343. }
  344. /* Set 2 (half-width katakana) */
  345. if (jchar[0] == SS2_CHAR) {
  346. jcharlen = 2;
  347. jstate = JS_HALFKATA;
  348. if ((jchar[1] = readchar (NULL)))
  349. readcount++;
  350. else
  351. jstate = JS_ETX;
  352. return jstate;
  353. }
  354. /* If first jchar doesn't match expected EUC coding,
  355. * discard it until we get back into sync.
  356. */
  357. jcharlen = 1;
  358. return (jstate = JS_DISCARD);
  359. } /* read_jchar() */
  360. /************************************************/
  361. /* */
  362. /* kanji_compounder */
  363. /* */
  364. /************************************************/
  365. /* Subroutine of parse_substring() of jpn_parser().
  366. * Used only for language DtSrLaJPN (power series compounding).
  367. * Called repeatedly when the substring is a sequence of kanji chars.
  368. * For each call writes to outbuf and returns a single kanji
  369. * compound word, using every possible compound in the substring
  370. * from length 1 to length MAX_KANJI_CLEN.
  371. * Updates offsetp for each word returned.
  372. * Returns NULL when substring exhausted. First call for
  373. * a new substring indicated by global is_new_substring.
  374. */
  375. static UCHAR *kanji_compounder (void)
  376. {
  377. static int all_done = TRUE;
  378. static int clen = MAX_KANJI_CLEN + 1;
  379. static UCHAR *mysubstrp = NULL;
  380. static UCHAR *mysubstrend = NULL;
  381. static UCHAR *op, *ss;
  382. static int i;
  383. if (is_new_substring) {
  384. is_new_substring = FALSE;
  385. all_done = FALSE;
  386. clen = 1;
  387. mysubstrp = substrbuf;
  388. mysubstrend = substrbuf + strlen ((char*)substrbuf);
  389. }
  390. /* Advance compound length by 1.
  391. * If max compound length exceeded, reset it
  392. * to 1 and increment substring pointer by 1 jchar.
  393. */
  394. else {
  395. if (all_done)
  396. return NULL;
  397. if (++clen > MAX_KANJI_CLEN) {
  398. clen = 1;
  399. mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
  400. }
  401. }
  402. /* Assemble one word into outbuf, of length clen,
  403. * beginning at current substring ptr.
  404. * If there aren't enough jchars left in string,
  405. * reset clen to 1 and advance substrp by 1 jchar.
  406. * We're all done when substring exhausted.
  407. */
  408. while (mysubstrp < mysubstrend) {
  409. op = outbuf;
  410. ss = mysubstrp;
  411. for (i = 0; i < clen; i++) {
  412. /* Are there enough jchars left in substring? */
  413. if (ss >= mysubstrend) {
  414. clen = 1;
  415. mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
  416. i = 0; /* indicates assembly failure */
  417. break; /* breaks the for loop */
  418. }
  419. /* Assemble one jchar into outbuf */
  420. if (*ss == SS3_CHAR)
  421. *op++ = *ss++;
  422. *op++ = *ss++;
  423. *op++ = *ss++;
  424. }
  425. /* Did word assembly succeed? */
  426. if (i >= clen) {
  427. *op = 0;
  428. if (offsetp)
  429. *offsetp = substr_offset + (mysubstrp - substrbuf);
  430. if (debugging_jpn)
  431. fprintf (aa_stderr,
  432. "knjcompdr: subofs=%2ld totofs=%3ld \"%s\"\n",
  433. (long) (mysubstrp - substrbuf), *offsetp, outbuf);
  434. return outbuf;
  435. }
  436. }
  437. all_done = TRUE;
  438. return NULL;
  439. } /* kanji_compounder() */
  440. /************************************************/
  441. /* */
  442. /* search_kanjitree */
  443. /* */
  444. /************************************************/
  445. /* Subroutine of parse_substring() of jpn_parser().
  446. * Used only for language DtSrLaJPN2; DtSrLaJPN calls
  447. * kanji_compounder() to generate compounds algorithmically.
  448. * First call for a new substring of kanjis is indicated
  449. * when is_new_substring is TRUE. Each call, then and thereafter,
  450. * returns a token (1) for each individual kanji char in string,
  451. * and (2) for each sequence of kanjis found in the kanji
  452. * compounds JPNTREE which begins with each char in string.
  453. * Also returns offset of returned token in offsetp.
  454. * Returns NULL when string is exhausted.
  455. * Variables are static for speeeeed.
  456. */
  457. static UCHAR *search_kanjitree (void)
  458. {
  459. static int all_done = TRUE;
  460. static JPNTREE *node, *last_node;
  461. static UCHAR *substrp, *substrend;
  462. static int direction;
  463. static int jcharlen;
  464. if (is_new_substring) {
  465. is_new_substring = FALSE;
  466. all_done = FALSE;
  467. substrend = substrbuf + strlen ((char*)substrbuf);
  468. substrp = substrbuf;
  469. /* Return first substr jchar as next token */
  470. last_node = NULL; /* NULL = tree not searched yet */
  471. jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
  472. strncpy ((char*)outbuf, (char*)substrp, jcharlen);
  473. outbuf [jcharlen] = 0;
  474. if (offsetp)
  475. *offsetp = substr_offset;
  476. return outbuf;
  477. }
  478. else if (all_done)
  479. return NULL;
  480. /* If not enough chars left in substring to search tree,
  481. * treat it as an exhausted tree search. In other words,
  482. * reset tree search, increment to next jchar, and return it.
  483. */
  484. if (strlen ((char*)substrp) < 4) {
  485. if (debugging_jpn)
  486. fputs ("knjtree: ...remaining substring too short", aa_stderr);
  487. EXHAUSTED_TREE:
  488. if (debugging_jpn)
  489. fputs (".\n", aa_stderr);
  490. last_node = NULL;
  491. substrp += jcharlen;
  492. if (substrp >= substrend) {
  493. all_done = TRUE;
  494. return NULL;
  495. }
  496. jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
  497. strncpy ((char*)outbuf, (char*)substrp, jcharlen);
  498. outbuf [jcharlen] = 0;
  499. if (offsetp)
  500. *offsetp = substr_offset + (substrp - substrbuf);
  501. return outbuf;
  502. }
  503. /* If last call resulted in a tree hit, the node was saved.
  504. * Continue the linked list search directly from the last hit.
  505. */
  506. if (last_node) {
  507. last_node = last_node->next;
  508. if (debugging_jpn)
  509. fputs ("knjtree: ...continue tree search: ", aa_stderr);
  510. LINKED_LIST_SEARCH:
  511. for (node = last_node; node; node = node->next) {
  512. if ((strncmp ((char*)substrp, node->word, node->len)) == 0) {
  513. /* HIT on linked list search */
  514. last_node = node;
  515. strcpy ((char*)outbuf, node->word);
  516. if (debugging_jpn)
  517. fprintf (aa_stderr, "* '%s'\n", outbuf);
  518. if (offsetp)
  519. *offsetp = substr_offset + (substrp - substrbuf);
  520. return outbuf;
  521. }
  522. else if (debugging_jpn)
  523. fputc ('-', aa_stderr);
  524. }
  525. goto EXHAUSTED_TREE;
  526. }
  527. /* Start new binary tree search at curr jchar.
  528. * If hit, commence linked list search.
  529. */
  530. if (debugging_jpn)
  531. fprintf (aa_stderr,
  532. "knjtree: \"%.4s...\" ", substrp);
  533. for (node = kanjitree; node != NULL; ) {
  534. if ((direction = strncmp ((char*)substrp, node->word, 4)) == 0) {
  535. /* HIT on binary search */
  536. last_node = node;
  537. goto LINKED_LIST_SEARCH;
  538. }
  539. /* Descend left or right depending on word */
  540. if (debugging_jpn)
  541. fputc ((direction < 0) ? 'L' : 'R', aa_stderr);
  542. if (direction < 0)
  543. node = node->llink;
  544. else
  545. node = node->rlink;
  546. }
  547. /* No match on first 4 bytes of substrp in binary tree.
  548. * Tree exhausted without a hit, so increment to next
  549. * jchar in substring and return it as a word.
  550. */
  551. goto EXHAUSTED_TREE;
  552. } /* search_kanjitree() */
  553. /************************************************/
  554. /* */
  555. /* parse_substring */
  556. /* */
  557. /************************************************/
  558. /* Subroutine of jpn_parser().
  559. * Returns next Japanese multibyte word token from current
  560. * substring of jchars, or NULL when out of tokens.
  561. * Returned token is valid until next call.
  562. * Static args initialized at first call for a new substring.
  563. * Provides optional kanji compounding depending on PA_ flags.
  564. * We usually compound at index time (dtsrindex) or when query
  565. * is Query-By-Example (statistical searches), and usually don't
  566. * compound boolean queries.
  567. */
  568. static UCHAR *parse_substring (void)
  569. {
  570. static int is_substr_end = TRUE;
  571. static int substrlen = 0;
  572. static PARG myparg;
  573. static UCHAR *token;
  574. static long myoffset;
  575. if (is_new_substring) {
  576. substrlen = strlen ((char*)substrbuf);
  577. /* A very common ascii substring is the final line-feed
  578. * at the end of a line of text--discard it now.
  579. */
  580. if (last_jstate == JS_ASCII
  581. && substrlen == 1
  582. && substrbuf[0] == '\n') {
  583. is_substr_end = TRUE;
  584. is_new_substring = FALSE;
  585. return NULL;
  586. }
  587. is_substr_end = FALSE;
  588. if (!outbuf)
  589. outbuf = austext_malloc (DtSrMAXWIDTH_HWORD + 8,
  590. PROGNAME"807", NULL);
  591. if (debugging_jpn) {
  592. int i;
  593. fprintf (aa_stderr, "jpnsubstr: js=%s len=%ld str='",
  594. display_jstate(last_jstate), (long) substrlen);
  595. for (i = 0; i < substrlen; i++)
  596. fputc ((substrbuf[i] < 32)? '~' : substrbuf[i],
  597. aa_stderr);
  598. fprintf (aa_stderr, "'\n");
  599. if (last_jstate == JS_ROMAN) {
  600. fprintf (aa_stderr, " (ascii equiv: '");
  601. for (i = 1; i < substrlen; i+=2)
  602. fputc ((substrbuf[i] & 0x7f) + 32, aa_stderr);
  603. fprintf (aa_stderr, "')\n");
  604. }
  605. fflush (aa_stderr);
  606. }
  607. } /* endif is_new_substring */
  608. if (is_substr_end)
  609. return NULL;
  610. switch (last_jstate) {
  611. case JS_DISCARD:
  612. /* Ignore discardable substrings */
  613. is_new_substring = FALSE;
  614. is_substr_end = TRUE;
  615. return NULL;
  616. case JS_KATAKANA:
  617. case JS_ROMAN:
  618. case JS_CYRILLIC:
  619. case JS_GREEK:
  620. case JS_ALPHA:
  621. case JS_HALFKATA:
  622. /* Treat entire substring as single parsed word */
  623. ENTIRE_SUBSTR_IS_WORD:
  624. if (debugging_jpn)
  625. fputs (" token is entire substring.\n", aa_stderr);
  626. strncpy ((char*)outbuf, (char*)substrbuf, DtSrMAXWIDTH_HWORD);
  627. outbuf [DtSrMAXWIDTH_HWORD - 1] = 0;
  628. is_new_substring = FALSE;
  629. is_substr_end = TRUE;
  630. if (offsetp)
  631. *offsetp = substr_offset;
  632. return outbuf;
  633. case JS_ASCII:
  634. /* Call the full teskey_parser() for European languages.
  635. * Includes stoplist and include list processing.
  636. */
  637. if (is_new_substring) {
  638. is_new_substring = FALSE;
  639. if (debugging_jpn)
  640. fputs (" calling teskey parser.\n", aa_stderr);
  641. myparg.dblk = jpn_dblk;
  642. myparg.string = substrbuf;
  643. myparg.ftext = NULL;
  644. myparg.offsetp = &myoffset;
  645. token = (UCHAR *) teskey_parser (&myparg);
  646. }
  647. else
  648. token = (UCHAR *) teskey_parser (NULL);
  649. if (token) {
  650. if (offsetp)
  651. *offsetp = substr_offset + myoffset;
  652. }
  653. else
  654. is_substr_end = TRUE;
  655. return token;
  656. case JS_KANJI:
  657. /* If not compounding, treat entire substring
  658. * as one query word, ie a single compound kanji word.
  659. * If compounding, each individual kanji in the
  660. * substring is returned as a word by itself.
  661. * Each kanji can be 2 or 3 bytes depending on
  662. * which code set it came from. In addition,
  663. * sequences of 2 or more kanjis ('compound kanji
  664. * words') are returned as individual words.
  665. * Method of kanji compounding depends on language:
  666. * DtSrLaJPN does "power series" kanji compounding,
  667. * DtSrLaJPN2 looks up kanji compounds in a word tree.
  668. * Both functions test and reset is_new_substring,
  669. * update offsetp as necessary, and return either NULL
  670. * or a pointer to outbuf containing a valid token.
  671. */
  672. if (!do_compounding)
  673. goto ENTIRE_SUBSTR_IS_WORD;
  674. token = (language == DtSrLaJPN)?
  675. kanji_compounder() : search_kanjitree();
  676. if (!token)
  677. is_substr_end = TRUE;
  678. return token;
  679. default:
  680. break;
  681. } /* end state switch */
  682. /* Should never get here... */
  683. fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 20,
  684. "%s Program Error: Unknown jstate %d.\n") ,
  685. PROGNAME"246", last_jstate);
  686. DtSearchExit (46);
  687. } /* parse_substring() */
  688. /************************************************/
  689. /* */
  690. /* jpn_parser */
  691. /* */
  692. /************************************************/
  693. /* Returns next word token from text stream of packed EUC
  694. * Japanese text, languages DtSrLaJPN and DtSrLaJPN2.
  695. * Called from (1) dtsrindex, where readchar_ftext() cofunction
  696. * reads the .fzk file document 'stream', or (2) search engine
  697. * query parsers, where readchar_string() cofunction 'reads'
  698. * from the query string.
  699. *
  700. * First call passes args in PARG block. This resets end of
  701. * text block (ETX) flag, resets 'offset' counter to zero, etc.
  702. * Subsequent calls should pass NULL, and parser returns
  703. * next token in block, until reader cofunction reads ETX
  704. * and returns special ETX char ('\0'). Subsequent call to parser
  705. * returns NULL meaning "no tokens left in current stream".
  706. * Reader cofunction tolerates repeated calls after
  707. * the first ETX, still returning '\0'.
  708. *
  709. * This parser presumes all incoming text is packed EUC multibyte
  710. * Japanese chars as described above, but is otherwise unformatted.
  711. * Since parser accesses streams a multibyte char at a time,
  712. * it does not require periodic line feeds, etc.
  713. *
  714. * To control kanji compounding, caller should set a PA_ switch
  715. * in parg.flags as desired before call. Compounding is done
  716. * when indexing (dtsrindex) or for hiliting (comparing previous
  717. * search results against all possible words in document text).
  718. * But in a Query by Example (stat searches), parser might also
  719. * be asked to generate compound words. In boolean queries
  720. * (stems and exact words), parser should not generate compounds
  721. * because if user enters a compound string, he probably only wants
  722. * documents containing that exact token.
  723. *
  724. * Parser also returns offset information: number of bytes
  725. * since beginning of text block. The returned offsets are
  726. * NOT NECESSARILY IN ASCENDING ORDER due to kanji compounding.
  727. *
  728. * Variables are static or global for speeeeeeed.
  729. *
  730. * OUTPUT FORMAT: NULL or a static C string containing a
  731. * single parsed word token.
  732. * The text in the buffer is valid until the next call.
  733. * Each word is translated as described above.
  734. */
  735. char *jpn_parser (PARG *parg)
  736. {
  737. static int filling_substring = TRUE;
  738. static int was_discarding = FALSE;
  739. static int add_msgs = FALSE;
  740. static UCHAR *endsubstrbuf = NULL;
  741. static size_t substrbufsz = 0;
  742. static UCHAR *token;
  743. static UCHAR *substrp;
  744. /* If first call for new text block... */
  745. if (parg) {
  746. jpn_dblk = parg->dblk;
  747. language = jpn_dblk->dbrec.or_language;
  748. kanjitree = ((JPNBLK *)(jpn_dblk->parse_extra))->kanjitree;
  749. offsetp = parg->offsetp;
  750. do_compounding = (parg->flags & (PA_HILITING | PA_INDEXING));
  751. add_msgs = (parg->flags & PA_MSGS);
  752. if (parg->string) { /* text is query str from search engine */
  753. save_parg_string = parg->string;
  754. readchar_arg = parg->string;
  755. readchar = (READCFP) readchar_string;
  756. }
  757. else { /* text is from .fzk file in dtsrindex */
  758. save_parg_string = NULL;
  759. readchar_arg = parg;
  760. readchar = (READCFP) readchar_ftext;
  761. }
  762. if (substrbufsz == 0) {
  763. substrbufsz = SUBSTRBUFSZ;
  764. substrbuf = austext_malloc (SUBSTRBUFSZ + 8, PROGNAME"680", NULL);
  765. }
  766. endsubstrbuf = substrbuf + substrbufsz;
  767. if (debugging_jpn) {
  768. fprintf (aa_stderr,
  769. "jpnparser: start text block, substrbufsz=%ld.\n",
  770. (long) substrbufsz);
  771. fflush (aa_stderr);
  772. }
  773. /* Seed the first substring */
  774. filling_substring = TRUE;
  775. readcount = 0L;
  776. last_jstate = JS_STX;
  777. read_jchar();
  778. } /* endif (parg != NULL) */
  779. FILL_ANOTHER_SUBSTRING:
  780. /* Input text is presumed to contain substrings
  781. * of chars related by their EUC encoding.
  782. * Fill the substring buffer by reading in nonDISCARDable
  783. * multibyte jchars until jstate changes signaling
  784. * end of a substring.
  785. * Note last jchar read, the one that changes the jstate,
  786. * hangs around till we come back to this loop.
  787. */
  788. if (filling_substring) {
  789. if (debugging_jpn) {
  790. if (jstate == JS_DISCARD) {
  791. fputs ("jpnparser: js=DISCARD:", aa_stderr);
  792. was_discarding = TRUE;
  793. }
  794. else
  795. was_discarding = FALSE;
  796. }
  797. while (jstate == JS_DISCARD) {
  798. if (debugging_jpn)
  799. fprintf (aa_stderr, " %s", jchar);
  800. read_jchar();
  801. }
  802. if (debugging_jpn && was_discarding)
  803. fputc ('\n', aa_stderr);
  804. if (jstate == JS_ETX) {
  805. if (debugging_jpn)
  806. fputs ("jpnparser: js=ETX\n", aa_stderr);
  807. if (add_msgs) {
  808. char msgbuf [DtSrMAXWIDTH_HWORD + 100];
  809. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 21,
  810. "%s '%.*s' is not a valid Japanese word.") ,
  811. PROGNAME"812", DtSrMAXWIDTH_HWORD, save_parg_string);
  812. DtSearchAddMessage (msgbuf);
  813. }
  814. return NULL;
  815. }
  816. last_jstate = jstate;
  817. substrp = substrbuf;
  818. substr_offset = readcount - jcharlen;
  819. /* Fill the substring buffer.
  820. * Ensure substring buffer is big enough.
  821. */
  822. while (last_jstate == jstate) {
  823. if (endsubstrbuf - substrp < 8) {
  824. size_t curlen = substrp - substrbuf;
  825. if (debugging_jpn) {
  826. fprintf (aa_stderr,
  827. "jpnparser: curr substr len %lu, "
  828. "new substrbufsz %lu.\n",
  829. (unsigned long) curlen, (unsigned long) substrbufsz<<1);
  830. fflush (aa_stderr);
  831. }
  832. substrbufsz <<= 1; /* double its size */
  833. substrbuf = realloc (substrbuf, substrbufsz);
  834. endsubstrbuf = substrbuf + substrbufsz;
  835. substrp = substrbuf + curlen;
  836. }
  837. strncpy ((char*)substrp, (char*)jchar, jcharlen);
  838. substrp += jcharlen;
  839. read_jchar();
  840. }
  841. *substrp = 0;
  842. filling_substring = FALSE;
  843. is_new_substring = TRUE;
  844. }
  845. /* Empty the substring buffer returning each token
  846. * one by one, ie parse and return word tokens from string,
  847. * including possible kanji compounds if switched on.
  848. */
  849. if ((token = parse_substring()))
  850. return (char *) token;
  851. /* When current substring is empty, go back and fill another one.
  852. * If we're parsing a string (eg hiliting text of a doc),
  853. * parse_substring() will have used readchar_string().
  854. * Since we now want to resume using it to parse the original
  855. * string, we have to reset it's string ptr.
  856. */
  857. filling_substring = TRUE;
  858. if (save_parg_string)
  859. readchar_arg = save_parg_string + readcount;
  860. goto FILL_ANOTHER_SUBSTRING;
  861. } /* jpn_parser() */
  862. /************************************************/
  863. /* */
  864. /* load_jpntree */
  865. /* */
  866. /************************************************/
  867. /* Subroutine of load_jpn_language. Builds a JPNTREE
  868. * from a file of packed EUC compound words.
  869. * Basically a copy of load_wordtree() in lang.c.
  870. *
  871. * INPUT FILE FORMAT: One word per line, min 4 bytes (2 jchars),
  872. * all words packed EUC. Preferred order is frequency of
  873. * occurrence in the corpus to make searches efficient.
  874. * Otherwise the words should at least be in random order or
  875. * an order that will approximate a binary search.
  876. * If first char is ASCII (ie not packed EUC), line is
  877. * ignored as comments. Any ascii chars after packed EUC,
  878. * such as whitespace and/or subsequent ascii comments,
  879. * delimits word token (ie anything else on the line is ignored).
  880. * "Line" ends in ascii linefeed (\n).
  881. *
  882. * RETURNS 0 if file successfully loaded, returns 1 if file missing,
  883. * returns 2 and messages in global msglist if file has fatal errors.
  884. */
  885. static int load_jpntree (
  886. JPNTREE **treetop,
  887. char *fname)
  888. {
  889. int i;
  890. int comment_count = 0;
  891. int node_count = 0;
  892. int is_duplicate;
  893. long linecount = 0;
  894. UCHAR *cptr;
  895. UCHAR readbuf [256];
  896. char sprintbuf [_POSIX_PATH_MAX + 1024];
  897. FILE *fileid;
  898. JPNTREE *new;
  899. JPNTREE **this_link;
  900. if (debugging_loadlang | debugging_loadword)
  901. fprintf (aa_stderr, PROGNAME"1071 "
  902. "load_jpntree: fname='%s'\n", NULLORSTR(fname));
  903. if ((fileid = fopen (fname, "rt")) == NULL) {
  904. /* Not being able to find the file is not an error.
  905. * We indicate that with the return code.
  906. * But any other error (like permissions) is fatal.
  907. */
  908. if (errno == ENOENT) {
  909. if (debugging_loadlang | debugging_loadword)
  910. fputs (" ...file not found.\n", aa_stderr);
  911. return 1;
  912. }
  913. else {
  914. sprintf (sprintbuf,
  915. CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
  916. PROGNAME"362", fname, strerror(errno));
  917. DtSearchAddMessage (sprintbuf);
  918. return 2;
  919. }
  920. }
  921. /*--------- Main Read Loop ----------*/
  922. while (fgets ((char*)readbuf, sizeof(readbuf), fileid) != NULL) {
  923. linecount++;
  924. /*
  925. * Ignore lines beginning with any ascii char (comments).
  926. * Otherwise first or only packed EUC token on line
  927. * is the desired word.
  928. */
  929. if (readbuf[0] < 0x80) {
  930. comment_count++;
  931. continue;
  932. }
  933. for (cptr = readbuf; *cptr >= 0x80; cptr++)
  934. ;
  935. *cptr = 0;
  936. if (debugging_loadword) {
  937. fprintf (aa_stderr, " JPNWORD: '%s' %n", readbuf, &i);
  938. while (i++ < 28)
  939. fputc (' ', aa_stderr);
  940. }
  941. /* Test for word too short */
  942. if (strlen((char*)readbuf) < 4) {
  943. sprintf (sprintbuf, CATGETS(dtsearch_catd, MS_lang, 23,
  944. "%s Word '%s' on line %ld is too short.") ,
  945. PROGNAME"1074", readbuf, linecount);
  946. DtSearchAddMessage (sprintbuf);
  947. continue;
  948. }
  949. /* Allocate and populate a new node */
  950. i = strlen ((char*) readbuf);
  951. new = austext_malloc (sizeof(JPNTREE) + i + 4,
  952. PROGNAME"104", NULL);
  953. new->llink = NULL;
  954. new->rlink = NULL;
  955. new->next = NULL;
  956. new->len = i;
  957. new->word = (void *) (new + 1);
  958. strcpy (new->word, (char *) readbuf);
  959. /* Search binary tree, comparing only first 4 bytes */
  960. is_duplicate = FALSE;
  961. for (this_link = treetop; *this_link != NULL; ) {
  962. i = strncmp (new->word, (*this_link)->word, 4);
  963. if (i == 0) {
  964. /* If first 4 bytes are similar, search
  965. * linked list, comparing entire string.
  966. */
  967. while (*this_link != NULL) {
  968. i = strcmp (new->word, (*this_link)->word);
  969. /* Test for duplicate word */
  970. if (i == 0) {
  971. sprintf (sprintbuf,
  972. CATGETS(dtsearch_catd, MS_misc, 423,
  973. "%s Word '%s' in '%s' is a duplicate."),
  974. PROGNAME"423", readbuf, fname);
  975. DtSearchAddMessage (sprintbuf);
  976. /* duplicates aren't fatal, just ignore the word */
  977. is_duplicate = TRUE;
  978. break; /* discontinue list search */
  979. }
  980. if (debugging_loadword)
  981. fputc('-', aa_stderr);
  982. this_link = &(*this_link)->next;
  983. } /* end linked list search */
  984. break; /* discontinue tree search */
  985. } /* endif where first 4 bytes matched at a tree node */
  986. /* First 4 bytes dissimilar. Descend tree
  987. * to find next possible insertion point.
  988. */
  989. if (debugging_loadword)
  990. fputc(((i < 0)? 'L' : 'R'), aa_stderr);
  991. this_link = (JPNTREE **) ((i < 0) ?
  992. &(*this_link)->llink : &(*this_link)->rlink);
  993. } /* end binary tree search */
  994. /* Don't link anything if error found while descending tree */
  995. if (is_duplicate) {
  996. if (debugging_loadword)
  997. fputs (" duplicate!\n", aa_stderr);
  998. free (new);
  999. continue;
  1000. }
  1001. /* Insert new node at current location in tree */
  1002. *this_link = new;
  1003. if (debugging_loadword)
  1004. fputs(".\n", aa_stderr);
  1005. node_count++;
  1006. } /* end of read loop */
  1007. fclose (fileid);
  1008. if (node_count <= 0) {
  1009. if (debugging_loadlang | debugging_loadword)
  1010. fprintf (aa_stderr,
  1011. PROGNAME"1185 load '%s' unsuccessful, %d comments discarded.\n",
  1012. fname, comment_count);
  1013. sprintf (sprintbuf, CATGETS(dtsearch_catd, MS_lang, 24,
  1014. "%s No Japanese words in word file '%s'.") ,
  1015. PROGNAME"1186", fname);
  1016. DtSearchAddMessage (sprintbuf);
  1017. return 2;
  1018. }
  1019. else {
  1020. if (debugging_loadlang | debugging_loadword)
  1021. fprintf (aa_stderr,
  1022. PROGNAME"1193 load word file '%s' successful, %d words.\n",
  1023. fname, node_count);
  1024. return 0;
  1025. }
  1026. } /* load_jpntree() */
  1027. /************************************************/
  1028. /* */
  1029. /* load_jpn_language */
  1030. /* */
  1031. /************************************************/
  1032. /* Loads a dblk with japanese (DtSrLaJPN, DtSrLaJPN2)
  1033. * structures and function pointers.
  1034. * Called from load_language(), with identical input and output.
  1035. * Does not reload structures previously loaded in
  1036. * other jpn dblks on dblist if derived from identical files.
  1037. * But always loads structures if passed dblist is NULL.
  1038. * Presumes dblk already partially initialized:
  1039. * name, path, language, flags.
  1040. * Returns TRUE if all successful. Otherwise
  1041. * returns FALSE with err msgs on ausapi_msglist.
  1042. */
  1043. int load_jpn_language (DBLK *dblk, DBLK *dblist)
  1044. {
  1045. extern int ascii_charmap[]; /* in lang.c */
  1046. int i;
  1047. int errcount = 0;
  1048. JPNBLK *jpnblk;
  1049. char fname [_POSIX_PATH_MAX + 4];
  1050. char path [_POSIX_PATH_MAX + 4];
  1051. char msgbuf [_POSIX_PATH_MAX + 128];
  1052. dblk->charmap = ascii_charmap; /* for teskey */
  1053. dblk->parser = jpn_parser;
  1054. dblk->lstrupr = null_lstrupr;
  1055. dblk->stemmer = null_stemmer;
  1056. if (dblk->dbrec.or_maxwordsz == 0) /* for teskey */
  1057. dblk->dbrec.or_maxwordsz = MAXWIDTH_SWORD - 1;
  1058. if (dblk->dbrec.or_minwordsz == 0) /* for teskey */
  1059. dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
  1060. jpnblk = austext_malloc (sizeof(JPNBLK) + 4, PROGNAME"2107", NULL);
  1061. memset (jpnblk, 0, sizeof(JPNBLK));
  1062. dblk->parse_extra = (void *) jpnblk;
  1063. /* Load optional katakana and kanji word lists.
  1064. * If specific dblk version not found,
  1065. * try the default language version.
  1066. * If either has load errors, return a failure.
  1067. * If both are missing, just forget it.
  1068. */
  1069. if (dblk->path == NULL)
  1070. path[0] = 0;
  1071. else {
  1072. if (strlen (dblk->path) > _POSIX_PATH_MAX - 14) {
  1073. sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 25,
  1074. "%s Database '%s' path too long: '%s'.") ,
  1075. PROGNAME"759", dblk->name, dblk->path);
  1076. DtSearchAddMessage (msgbuf);
  1077. return FALSE;
  1078. }
  1079. strcpy (path, dblk->path);
  1080. ensure_end_slash (path);
  1081. }
  1082. #ifdef NO_KATAKANA_TREES_YET
  1083. /* Load katakana wordtree */
  1084. strcpy (fname, path);
  1085. strcat (fname, dblk->name);
  1086. strcat (fname, EXT_KATAKANA);
  1087. i = load_jpntree (&jpnblk->katatree, fname);
  1088. if (i == 1) { /* ...db specific file not found */
  1089. if (jpn_katatree == NULL) { /* load default... */
  1090. strcpy (fname, path);
  1091. strcat (fname, "jpn");
  1092. strcat (fname, EXT_KATAKANA);
  1093. i = load_jpntree (&jpn_katatree, fname);
  1094. }
  1095. else /* default already loaded */
  1096. i == 0;
  1097. jpnblk->katatree = jpn_katatree;
  1098. }
  1099. if (i > 1)
  1100. errcount++;
  1101. #endif /* NO_KATAKANA_TREES_YET */
  1102. /* Load kanji wordtree only if kanji compounds are derived
  1103. * from list in file, ie for language DtSrLaJPN2 only.
  1104. * If database specific list not found,
  1105. * use language generic list. If language generic
  1106. * list also not found, ignore compounding.
  1107. * Only one language generic list will
  1108. * be loaded, at jpn_kanjitree.
  1109. */
  1110. if (dblk->dbrec.or_language == DtSrLaJPN2) {
  1111. strcpy (fname, path);
  1112. strcat (fname, dblk->name);
  1113. strcat (fname, EXT_KANJI);
  1114. i = load_jpntree (&jpnblk->kanjitree, fname);
  1115. if (i == 1) { /* ...db specific file not found */
  1116. /* If the generic knj file (jpn.knj) was
  1117. * never loaded, try loading it now.
  1118. */
  1119. if (jpn_kanjitree == NULL) {
  1120. strcpy (fname, path);
  1121. strcat (fname, "jpn");
  1122. strcat (fname, EXT_KANJI);
  1123. load_jpntree (&jpn_kanjitree, fname);
  1124. /* (it either worked or it didn't) */
  1125. }
  1126. /* Whether generic load successful or not,
  1127. * try to use it (eg it might still be NULL).
  1128. */
  1129. jpnblk->kanjitree = jpn_kanjitree;
  1130. }
  1131. if (i > 1) /* error trying to open db specific file */
  1132. errcount++;
  1133. }
  1134. return (errcount > 0)? FALSE : TRUE;
  1135. } /* load_jpn_language() */
  1136. /************************************************/
  1137. /* */
  1138. /* free_jpntree */
  1139. /* */
  1140. /************************************************/
  1141. /* Identical to free_wordtree() in lang.c
  1142. * (link inversion traversal, from Data Structure Techniques,
  1143. * Thomas A. Standish, Algorithm 3.6),
  1144. * except post order visit includes freeing
  1145. * linked list at each tree node.
  1146. */
  1147. static void free_jpntree (JPNTREE ** jpntree_head)
  1148. {
  1149. JPNTREE *next, *prev, *pres;
  1150. JPNTREE *listp, *next_listp;
  1151. if (*jpntree_head == NULL)
  1152. return;
  1153. pres = *jpntree_head;
  1154. prev = NULL;
  1155. DESCEND_LEFT:
  1156. pres->word = (void *) 0; /* preorder visit: TAG = 0 */
  1157. next = pres->llink;
  1158. if (next != NULL) {
  1159. pres->llink = prev;
  1160. prev = pres;
  1161. pres = next;
  1162. goto DESCEND_LEFT;
  1163. }
  1164. DESCEND_RIGHT:
  1165. next = pres->rlink;
  1166. if (next != NULL) {
  1167. pres->word = (void *) 1; /* TAG = 1 */
  1168. pres->rlink = prev;
  1169. prev = pres;
  1170. pres = next;
  1171. goto DESCEND_LEFT;
  1172. }
  1173. POSTORDER_VISIT:
  1174. listp = pres;
  1175. while (listp->next) {
  1176. next_listp = listp->next;
  1177. free (listp);
  1178. listp = next_listp;
  1179. }
  1180. free (listp);
  1181. if (prev == NULL) { /* end of algorithm? */
  1182. *jpntree_head = NULL;
  1183. return;
  1184. }
  1185. if (prev->word == (void *) 0) { /* go up left leg */
  1186. next = prev->llink;
  1187. pres = prev;
  1188. prev = next;
  1189. goto DESCEND_RIGHT;
  1190. }
  1191. else { /* go up right leg */
  1192. next = prev->rlink;
  1193. prev->word = (void *) 0; /* restore TAG = 0 */
  1194. pres = prev;
  1195. prev = next;
  1196. goto POSTORDER_VISIT;
  1197. }
  1198. } /* free_jpntree() */
  1199. /************************************************/
  1200. /* */
  1201. /* unload_jpn_language */
  1202. /* */
  1203. /************************************************/
  1204. /* Frees storage for structures allocated by load_jpn_language().
  1205. * Called when engine REINITs due to change in site config file
  1206. * or databases.
  1207. * The global jpntrees are not currently unloaded because they
  1208. * are presumed valid for the duration of the engine session.
  1209. * Currently there are no teskey trees (inclist, stoplist) to free.
  1210. */
  1211. void unload_jpn_language (DBLK *dblk)
  1212. {
  1213. /* free jpnblk and any database-associated jpntrees */
  1214. if (dblk->parse_extra) {
  1215. JPNBLK *jpnblk = (JPNBLK *) dblk->parse_extra;
  1216. if (jpnblk->katatree && jpnblk->katatree != jpn_katatree)
  1217. free_jpntree (&jpnblk->katatree);
  1218. if (jpnblk->kanjitree && jpnblk->kanjitree != jpn_kanjitree)
  1219. free_jpntree (&jpnblk->kanjitree);
  1220. free (jpnblk);
  1221. dblk->parse_extra = NULL;
  1222. }
  1223. return;
  1224. } /* unload_jpn_language() */
  1225. /******************** JPN.C ********************/