dtsrhanf.sgm 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. <!-- $XConsortium: dtsrhanf.sgm /main/6 1996/09/08 20:19:48 rws $ -->
  2. <!-- (c) Copyright 1996 Digital Equipment Corporation. -->
  3. <!-- (c) Copyright 1996 Hewlett-Packard Company. -->
  4. <!-- (c) Copyright 1996 International Business Machines Corp. -->
  5. <!-- (c) Copyright 1996 Sun Microsystems, Inc. -->
  6. <!-- (c) Copyright 1996 Novell, Inc. -->
  7. <!-- (c) Copyright 1996 FUJITSU LIMITED. -->
  8. <!-- (c) Copyright 1996 Hitachi. -->
  9. <![ %CDE.C.CDE; [<RefEntry Id="CDE.INFO.dtsrhanfile">]]>
  10. <RefMeta>
  11. <RefEntryTitle>dtsrhanfile</RefEntryTitle>
  12. <ManVolNum>special file</ManVolNum>
  13. </RefMeta>
  14. <RefNameDiv>
  15. <RefName>dtsrhanfile</RefName>
  16. <RefPurpose>
  17. Describes the format and syntax of DtSearch han files
  18. </RefPurpose>
  19. </RefNameDiv>
  20. <RefSynopsisDiv>
  21. <Synopsis>
  22. <Symbol Role="Variable">filename</Symbol>.han
  23. </Synopsis>
  24. </RefSynopsisDiv>
  25. <RefSect1>
  26. <Title>DESCRIPTION</Title>
  27. <Para>Han files are the user generated profile files for <command>dtsrhan</command>.
  28. They identify fields in incoming text from which output fzk
  29. file fields can be constructed. The data from han files
  30. are loaded into memory by <command>dtsrhan</command> at initialization time.
  31. <command>dtsrhan</command> and han files have not been internationalized;
  32. han files may only contain ASCII characters.
  33. </para>
  34. <refsect2>
  35. <Title>General Format</Title>
  36. <para>All identifiers must begin with a letter, and must be composed entirely
  37. of alphanumerics and/or the underscore.
  38. </para>
  39. <para>Observe the following points when using "strings":
  40. </para>
  41. <itemizedlist>
  42. <listitem>
  43. <para>If an identifying string contains quotes, use a backslash
  44. to create the quote. Example:
  45. </para>
  46. <programlisting>
  47. this string \"contains\" quotes
  48. </programlisting>
  49. <para>would find the string <literal>this string "contains" quotes</literal>.
  50. </para>
  51. </listitem>
  52. <listitem>
  53. <para>The above point makes it necessary to use double backslashes to create
  54. a single backslash. Example:
  55. </para>
  56. <programlisting>
  57. this string has a \\ backslash
  58. </programlisting>
  59. <para>would find the string <literal>this string has a \ backslash</literal>.
  60. </para>
  61. </listitem>
  62. <listitem>
  63. <para>Actually, using the backslash in any string will cause the next
  64. character to be included without exception. Thus, a string
  65. with <literal>this is \a test</literal> will end up being
  66. <literal>this is a test</literal>.
  67. The backslash is ignored, and the next character is embedded
  68. in the string. This is only needed in the two cases described
  69. above, but can be used for any purpose.
  70. </para>
  71. </listitem>
  72. </itemizedlist>
  73. </refsect2>
  74. <refsect2>
  75. <Title>Individual Line Syntax</Title>
  76. <variablelist>
  77. <varlistentry><term># ... | blank line</term>
  78. <listitem>
  79. <para>Han file comment. Any line beginning with a pound sign
  80. in the first column, or any blank line, is discarded.
  81. </para>
  82. </listitem>
  83. </varlistentry>
  84. <varlistentry><term>line <emphasis>identifier</emphasis> = <emphasis>physical_line_number</emphasis></term>
  85. <listitem>
  86. <para>Defines a <literal>line</literal> with a physical line number in the record.
  87. <emphasis>physical_line_number</emphasis> must be a number.
  88. </para>
  89. </listitem>
  90. </varlistentry>
  91. <varlistentry><term>line <emphasis>identifier</emphasis> = <emphasis>column_number</emphasis>,"<emphasis>string</emphasis>" [<emphasis>column_number</emphasis>,"<emphasis>string</emphasis>"] ...</term>
  92. <listitem>
  93. <para>Defines a <literal>line</literal> using a column number and a
  94. 'signature' string that should appear at that column.
  95. <emphasis>column_number</emphasis> can be a number, or
  96. <literal>*</literal> for 'any column'. "<emphasis>string</emphasis>"
  97. should be a string that occurs on the line in question. It is possible
  98. to define complex signatures using multiple clauses.
  99. </para>
  100. </listitem>
  101. </varlistentry>
  102. <varlistentry><term>field <emphasis>identifier</emphasis> = <emphasis>line_identifier</emphasis>,"<emphasis>string</emphasis>", <emphasis>offset</emphasis>, <emphasis>length</emphasis></term>
  103. <listitem>
  104. <para>Defines a <literal>field</literal> based on a declared line, a string
  105. found on that line, the offset from the first letter of the string, and
  106. the length of field.
  107. </para>
  108. <para><emphasis>line_identifier</emphasis> is an identifier declared with the
  109. <literal>line</literal> directive (see above).
  110. </para>
  111. <para>"<emphasis>string</emphasis>" is a string for relative positioning,
  112. where a field will follow a string that may not always occur in the
  113. same position on a line. If it is known that the field will always be
  114. in the same position, an empty string ("") may be used.
  115. <emphasis>string</emphasis> must be enclosed in double quotes.
  116. <emphasis>offset</emphasis> must be a number, identifying the offset
  117. from the first character in the string. It starts at position 1, not 0,
  118. and may be negative.
  119. </para>
  120. <para><emphasis>length</emphasis> represents the length of the field. It may
  121. be a number, or it may be one of two special tokens:
  122. </para>
  123. <variablelist>
  124. <varlistentry><term><literal>eow</literal></term>
  125. <listitem>
  126. <para>End of word. The field will begin at <emphasis>offset</emphasis> and
  127. continue until the next white-space character.
  128. </para>
  129. </listitem>
  130. </varlistentry>
  131. <varlistentry><term><literal>eoln</literal></term>
  132. <listitem>
  133. <para>End of line. The field will begin at <emphasis>offset</emphasis> and
  134. continue to the end of the line.
  135. </para>
  136. </listitem>
  137. </varlistentry>
  138. </variablelist>
  139. <para>A field <emphasis>identifier</emphasis> beginning with 3 uppercase M's
  140. ("MMM...") will be considered an English month name string.
  141. At run time, if the first 3 chars of the field's value
  142. equal the first three chars of an English month name,
  143. the value string will be translated to a two character
  144. string of digits in the range "01" to "12".
  145. For example, if field <emphasis>MMMmymonth</emphasis> had an original value of
  146. "April ", it will be translated to "04" before use.
  147. </para>
  148. <para>In the case where a <literal>line</literal> identifier is associated with
  149. multiple lines in a single document, the field value will
  150. be determined from the last occurrence of the line within
  151. the record.
  152. </para>
  153. </listitem>
  154. </varlistentry>
  155. <varlistentry><term>constant <emphasis>identifier</emphasis> = "<emphasis>string</emphasis>"</term>
  156. <listitem>
  157. <para>Defines a <literal>constant</literal> field that can be used in
  158. abstracts and keys. The <emphasis>identifier</emphasis> is defined
  159. exactly the same as a <literal>field</literal> identifier. The value
  160. must be enclosed in double quotes.
  161. </para>
  162. </listitem>
  163. </varlistentry>
  164. <varlistentry><term>date = null | <emphasis>field_id</emphasis> [+ <emphasis>field_id</emphasis>] ...</term>
  165. <listitem>
  166. <para>Defines the document date for each document. It will
  167. be converted into a correctly formatted fzk file date line.
  168. </para>
  169. <para><literal>null</literal> specifies undated documents. Undated documents
  170. always qualify for searches irrespective of date
  171. qualifiers in <function>DtSearchQuery</function>.
  172. </para>
  173. <para><emphasis>field_id</emphasis> is an identifier declared using the <literal>field</literal>
  174. or <literal>constant</literal> directives (see above).
  175. "MMM" fields are often useful for date assemblies.
  176. </para>
  177. <para>Multiple fields may be concatenated into a date.
  178. </para>
  179. <para>After concatenation, the assembled date must be of the following format:
  180. <emphasis>YYYYMMDDhhmm</emphasis> (exactly 12 digits). For example,
  181. <literal>199404171701</literal> is April 17, 1994 at 5:01 pm.
  182. <literal>200405031000</literal> is May 3, 2004, at 10:00 am (10
  183. o'clock).
  184. </para>
  185. <para>Dates before 1900 or after 5995 are invalid.
  186. </para>
  187. <para>If <literal>date</literal> is not specified or is invalid, a generated date
  188. based on the current date and time will be used, but an
  189. invalid <literal>date</literal> will also generate an error message.
  190. </para>
  191. </listitem>
  192. </varlistentry>
  193. <varlistentry><term>key = <emphasis>field_id</emphasis> [+ <emphasis>field_id</emphasis>] ... | time | count</term>
  194. <listitem>
  195. <para>Defines the unique database key for each record in a fzk file.
  196. </para>
  197. <para><emphasis>field_id</emphasis> is a field identifier declared using the
  198. <literal>field</literal> or <literal>constant</literal> directives.
  199. </para>
  200. <para>Multiple fields may be concatenated into a key.
  201. </para>
  202. <para><literal>time</literal> is a special keyword used to generate keys based
  203. on the current run date and time, plus a sequential count suffix.
  204. </para>
  205. <para><literal>count</literal> is a special keyword used to generate keys
  206. based on a sequential count of records.
  207. </para>
  208. </listitem>
  209. </varlistentry>
  210. <varlistentry><term>upper</term>
  211. <listitem>
  212. <para>Specifies that keys written by <command>dtsrhan</command> are to be entirely converted
  213. to upper case. Without using this directive, mixed-case keys
  214. are allowed.
  215. </para>
  216. </listitem>
  217. </varlistentry>
  218. <varlistentry><term>keychar = A | B | ...Z</term>
  219. <listitem>
  220. <para>Defines the character used to categorize keys for DtSearch. It
  221. must be an uppercase ASCII alphabetic character.
  222. </para>
  223. </listitem>
  224. </varlistentry>
  225. <varlistentry><term>delimiter = <emphasis>line_identifier</emphasis>, bottom</term>
  226. <listitem>
  227. <para>Defines the end of text (ETX) delimiter that will separate records.
  228. </para>
  229. <para><emphasis>line_identifier</emphasis> is an identifier declared with the
  230. <literal>line</literal> directive.
  231. </para>
  232. <para><literal>bottom</literal> is required. It specifies that the ETX will
  233. occur at the bottom of each record. Top of record delimiters are not
  234. supported.
  235. </para>
  236. </listitem>
  237. </varlistentry>
  238. <varlistentry><term>image = all | none</term>
  239. <listitem>
  240. <para>Defines whether the document image retrieved by
  241. <function>DtSearchRetrieve</function> is to contain all or none of the
  242. record, prior to application of <literal>imageinclude</literal> or
  243. <literal>imageexclude</literal> directives later in the han file. It
  244. defaults to <literal>all</literal>.
  245. </para>
  246. </listitem>
  247. </varlistentry>
  248. <varlistentry><term>imageinclude = <emphasis>line_identifier</emphasis> [- <emphasis>line_identifier</emphasis>]</term>
  249. <listitem>
  250. <para>Defines a line (or range of lines) to be included in the image.
  251. <emphasis>line_identifier</emphasis> is an identifier declared with the
  252. <literal>line</literal> directive.
  253. </para>
  254. </listitem>
  255. </varlistentry>
  256. <varlistentry><term>imageexclude = <emphasis>line_identifier</emphasis> [- <emphasis>line_identifier</emphasis>]</term>
  257. <listitem>
  258. <para>Defines a line (or range of lines) to be excluded from the image.
  259. <emphasis>line_identifier</emphasis> is an identifier declared with the
  260. <literal>line</literal> directive.
  261. </para>
  262. </listitem>
  263. </varlistentry>
  264. <varlistentry><term>abstract = field(s) <emphasis>field_identifier</emphasis> [+ <emphasis>field_identifier</emphasis>]...</term>
  265. <listitem>
  266. <para>Defines the abstract to be placed into the fzk file. It is created from
  267. the concatenation of fields. <emphasis>field_identifier</emphasis> is
  268. an identifier declared with the <literal>field</literal> or <literal>constant</literal> directives.
  269. </para>
  270. </listitem>
  271. </varlistentry>
  272. <varlistentry><term>delblanklines = true | false</term>
  273. <listitem>
  274. <para>Determines if blank lines are to be removed from the record image or
  275. not. It defaults to <literal>false</literal>.
  276. </para>
  277. </listitem>
  278. </varlistentry>
  279. </variablelist>
  280. </refsect2>
  281. <refsect2>
  282. <Title>Example</Title>
  283. <para>The sample han file shown here describes a text file containing a
  284. concatenated set of man page documents.
  285. </para>
  286. <programlisting>
  287. # All records in the incoming text file are delimited by the same
  288. # end of text convention as the default for an fzk file, namely
  289. # a form feed (control-L) on a line by itself ("\f\n").
  290. # Define a line named "etx" with that description,
  291. # and declare it to be the &lt;delimiter>.
  292. # Note that there must be a real ASCII control-L character between
  293. # the quotes in the line below.
  294. line etx = *,"^L"
  295. delimiter = etx, bottom
  296. # The command name that the man page is describing is on the first line.
  297. # To access it we need to define a line directive for line number 1.
  298. line line1 = 1
  299. # The name of the man page command begins in column 3 of line 1,
  300. # and the length is variable. So we define a field identifier
  301. # named "command1" from column 3 to the end of the word.
  302. field command1 = line1,"",3,eow
  303. # We want each document abstract to have a constant prefix
  304. # followed by the name of the command.
  305. constant preabs = "Man Pages for "
  306. abstract = fields preabs + command1
  307. # We want all keys to be the name of the command, prefixed with
  308. # the same identifying character, an uppercase M.
  309. keychar = M
  310. key = command1
  311. # We want each document date to be equivalent to the release
  312. # date of the original man pages, which we choose here to hard code
  313. # as November 1, 1994, at 1 o'clock in the afternoon.
  314. constant datecons = "199411011300"
  315. date = datecons
  316. </programlisting>
  317. </refsect2>
  318. </refsect1>
  319. <RefSect1>
  320. <Title>SEE ALSO</Title>
  321. <Para>&cdeman.dtsrhan;,
  322. &cdeman.dtsrindex;,
  323. &cdeman.dtsrfzkfiles;,
  324. &cdeman.dtsrlangfiles;,
  325. &cdeman.DtSearch;
  326. </Para>
  327. </RefSect1>
  328. </RefEntry>