IndexDocument.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774
  1. <?php
  2. declare(strict_types=1);
  3. /**
  4. * SPDX-FileCopyrightText: 2019 Nextcloud GmbH and Nextcloud contributors
  5. * SPDX-License-Identifier: AGPL-3.0-or-later
  6. */
  7. namespace OC\FullTextSearch\Model;
  8. use JsonSerializable;
  9. use OCP\FullTextSearch\Exceptions\FullTextSearchIndexNotAvailableException;
  10. use OCP\FullTextSearch\Model\IDocumentAccess;
  11. use OCP\FullTextSearch\Model\IIndex;
  12. use OCP\FullTextSearch\Model\IIndexDocument;
  13. /**
  14. * Class IndexDocument
  15. *
  16. * This is one of the main class of the FullTextSearch, used as a data transfer
  17. * object. An IndexDocument is created to manage documents around FullTextSearch,
  18. * during an index and during a search.
  19. * The uniqueness of an IndexDocument is made by the Id of the Content Provider
  20. * and the Id of the original document within the Content Provider.
  21. *
  22. * We will call original document the source from which the IndexDocument is
  23. * generated. As an example, an original document can be a file, a mail, ...
  24. *
  25. * @since 15.0.0
  26. *
  27. * @package OC\FullTextSearch\Model
  28. */
  29. class IndexDocument implements IIndexDocument, JsonSerializable {
  30. protected string $id = '';
  31. protected DocumentAccess $access;
  32. protected ?IIndex $index = null;
  33. protected int $modifiedTime = 0;
  34. protected string $source = '';
  35. protected array $tags = [];
  36. protected array $metaTags = [];
  37. protected array $subTags = [];
  38. protected string $title = '';
  39. protected string $content = '';
  40. protected string $hash = '';
  41. protected array $parts = [];
  42. protected string $link = '';
  43. protected array $more = [];
  44. protected array $excerpts = [];
  45. protected string $score = '';
  46. protected array $info = [];
  47. protected int $contentEncoded = 0;
  48. /**
  49. * IIndexDocument constructor.
  50. *
  51. * On creation, we assure the uniqueness of the object using the providerId
  52. * and the Id of the original document.
  53. *
  54. * @since 15.0.0
  55. */
  56. public function __construct(
  57. protected string $providerId,
  58. string $documentId,
  59. ) {
  60. $this->id = $documentId;
  61. }
  62. /**
  63. * Returns the Id of the original document.
  64. *
  65. * @since 15.0.0
  66. */
  67. final public function getId(): string {
  68. return $this->id;
  69. }
  70. /**
  71. * Returns the Id of the provider.
  72. *
  73. * @since 15.0.0
  74. */
  75. final public function getProviderId(): string {
  76. return $this->providerId;
  77. }
  78. /**
  79. * Set the Index related to the IIndexDocument.
  80. *
  81. * @see IIndex
  82. *
  83. * @since 15.0.0
  84. */
  85. final public function setIndex(IIndex $index): IIndexDocument {
  86. $this->index = $index;
  87. return $this;
  88. }
  89. /**
  90. * Get the Index.
  91. *
  92. * @throws FullTextSearchIndexNotAvailableException
  93. * @since 15.0.0
  94. */
  95. final public function getIndex(): IIndex {
  96. if ($this->index === null) {
  97. throw new FullTextSearchIndexNotAvailableException('No IIndex generated');
  98. }
  99. return $this->index;
  100. }
  101. /**
  102. * return if Index is defined.
  103. *
  104. * @since 16.0.0
  105. */
  106. final public function hasIndex(): bool {
  107. return $this->index !== null;
  108. }
  109. /**
  110. * Set the modified time of the original document.
  111. *
  112. * @since 15.0.0
  113. */
  114. final public function setModifiedTime(int $modifiedTime): IIndexDocument {
  115. $this->modifiedTime = $modifiedTime;
  116. return $this;
  117. }
  118. /**
  119. * Get the modified time of the original document.
  120. *
  121. * @since 15.0.0
  122. */
  123. final public function getModifiedTime(): int {
  124. return $this->modifiedTime;
  125. }
  126. /**
  127. * Check if the original document of the IIndexDocument is older than $time.
  128. *
  129. * @since 15.0.0
  130. */
  131. final public function isOlderThan(int $time): bool {
  132. return ($this->modifiedTime < $time);
  133. }
  134. /**
  135. * Set the read rights of the original document using a IDocumentAccess.
  136. *
  137. * @see IDocumentAccess
  138. *
  139. * @since 15.0.0
  140. */
  141. final public function setAccess(IDocumentAccess $access): IIndexDocument {
  142. $this->access = $access;
  143. return $this;
  144. }
  145. /**
  146. * Get the IDocumentAccess related to the original document.
  147. *
  148. * @since 15.0.0
  149. */
  150. final public function getAccess(): IDocumentAccess {
  151. return $this->access;
  152. }
  153. /**
  154. * Add a tag to the list.
  155. *
  156. * @since 15.0.0
  157. */
  158. final public function addTag(string $tag): IIndexDocument {
  159. $this->tags[] = $tag;
  160. return $this;
  161. }
  162. /**
  163. * Set the list of tags assigned to the original document.
  164. *
  165. * @since 15.0.0
  166. */
  167. final public function setTags(array $tags): IIndexDocument {
  168. $this->tags = $tags;
  169. return $this;
  170. }
  171. /**
  172. * Get the list of tags assigned to the original document.
  173. *
  174. * @since 15.0.0
  175. */
  176. final public function getTags(): array {
  177. return $this->tags;
  178. }
  179. /**
  180. * Add a meta tag to the list.
  181. *
  182. * @since 15.0.0
  183. */
  184. final public function addMetaTag(string $tag): IIndexDocument {
  185. $this->metaTags[] = $tag;
  186. return $this;
  187. }
  188. /**
  189. * Set the list of meta tags assigned to the original document.
  190. *
  191. * @since 15.0.0
  192. */
  193. final public function setMetaTags(array $tags): IIndexDocument {
  194. $this->metaTags = $tags;
  195. return $this;
  196. }
  197. /**
  198. * Get the list of meta tags assigned to the original document.
  199. *
  200. * @since 15.0.0
  201. */
  202. final public function getMetaTags(): array {
  203. return $this->metaTags;
  204. }
  205. /**
  206. * Add a sub tag to the list.
  207. *
  208. * @since 15.0.0
  209. */
  210. final public function addSubTag(string $sub, string $tag): IIndexDocument {
  211. if (!array_key_exists($sub, $this->subTags)) {
  212. $this->subTags[$sub] = [];
  213. }
  214. $this->subTags[$sub][] = $tag;
  215. return $this;
  216. }
  217. /**
  218. * Set the list of sub tags assigned to the original document.
  219. *
  220. * @since 15.0.0
  221. */
  222. final public function setSubTags(array $tags): IIndexDocument {
  223. $this->subTags = $tags;
  224. return $this;
  225. }
  226. /**
  227. * Get the list of sub tags assigned to the original document.
  228. * If $formatted is true, the result will be formatted in a one
  229. * dimensional array.
  230. *
  231. * @since 15.0.0
  232. */
  233. final public function getSubTags(bool $formatted = false): array {
  234. if ($formatted === false) {
  235. return $this->subTags;
  236. }
  237. $subTags = [];
  238. $ak = array_keys($this->subTags);
  239. foreach ($ak as $source) {
  240. $tags = $this->subTags[$source];
  241. foreach ($tags as $tag) {
  242. $subTags[] = $source . '_' . $tag;
  243. }
  244. }
  245. return $subTags;
  246. }
  247. /**
  248. * Set the source of the original document.
  249. *
  250. * @since 15.0.0
  251. */
  252. final public function setSource(string $source): IIndexDocument {
  253. $this->source = $source;
  254. return $this;
  255. }
  256. /**
  257. * Get the source of the original document.
  258. *
  259. * @since 15.0.0
  260. */
  261. final public function getSource(): string {
  262. return $this->source;
  263. }
  264. /**
  265. * Set the title of the original document.
  266. *
  267. * @since 15.0.0
  268. */
  269. final public function setTitle(string $title): IIndexDocument {
  270. $this->title = $title;
  271. return $this;
  272. }
  273. /**
  274. * Get the title of the original document.
  275. *
  276. * @since 15.0.0
  277. */
  278. final public function getTitle(): string {
  279. return $this->title;
  280. }
  281. /**
  282. * Set the content of the document.
  283. * $encoded can be NOT_ENCODED or ENCODED_BASE64 if the content is raw or
  284. * encoded in base64.
  285. *
  286. * @since 15.0.0
  287. */
  288. final public function setContent(string $content, int $encoded = 0): IIndexDocument {
  289. $this->content = $content;
  290. $this->contentEncoded = $encoded;
  291. return $this;
  292. }
  293. /**
  294. * Get the content of the original document.
  295. *
  296. * @since 15.0.0
  297. */
  298. final public function getContent(): string {
  299. return $this->content;
  300. }
  301. /**
  302. * Returns the type of the encoding on the content.
  303. *
  304. * @since 15.0.0
  305. */
  306. final public function isContentEncoded(): int {
  307. return $this->contentEncoded;
  308. }
  309. /**
  310. * Return the size of the content.
  311. *
  312. * @since 15.0.0
  313. */
  314. final public function getContentSize(): int {
  315. return strlen($this->getContent());
  316. }
  317. /**
  318. * Generate a hash, based on the content of the original document.
  319. *
  320. * @since 15.0.0
  321. */
  322. final public function initHash(): IIndexDocument {
  323. if ($this->getContent() === '' || is_null($this->getContent())) {
  324. return $this;
  325. }
  326. $this->hash = hash("md5", $this->getContent());
  327. return $this;
  328. }
  329. /**
  330. * Set the hash of the original document.
  331. *
  332. * @since 15.0.0
  333. */
  334. final public function setHash(string $hash): IIndexDocument {
  335. $this->hash = $hash;
  336. return $this;
  337. }
  338. /**
  339. * Get the hash of the original document.
  340. *
  341. * @since 15.0.0
  342. */
  343. final public function getHash(): string {
  344. return $this->hash;
  345. }
  346. /**
  347. * Add a part, identified by a string, and its content.
  348. *
  349. * It is strongly advised to use alphanumerical chars with no space in the
  350. * $part string.
  351. *
  352. * @since 15.0.0
  353. */
  354. final public function addPart(string $part, string $content): IIndexDocument {
  355. $this->parts[$part] = $content;
  356. return $this;
  357. }
  358. /**
  359. * Set all parts and their content.
  360. *
  361. * @since 15.0.0
  362. */
  363. final public function setParts(array $parts): IIndexDocument {
  364. $this->parts = $parts;
  365. return $this;
  366. }
  367. /**
  368. * Get all parts of the IIndexDocument.
  369. *
  370. * @since 15.0.0
  371. */
  372. final public function getParts(): array {
  373. return $this->parts;
  374. }
  375. /**
  376. * Add a link, usable by the frontend.
  377. *
  378. * @since 15.0.0
  379. */
  380. final public function setLink(string $link): IIndexDocument {
  381. $this->link = $link;
  382. return $this;
  383. }
  384. /**
  385. * Get the link.
  386. *
  387. * @since 15.0.0
  388. */
  389. final public function getLink(): string {
  390. return $this->link;
  391. }
  392. /**
  393. * Set more information that couldn't be set using other method.
  394. *
  395. * @since 15.0.0
  396. */
  397. final public function setMore(array $more): IIndexDocument {
  398. $this->more = $more;
  399. return $this;
  400. }
  401. /**
  402. * Get more information.
  403. *
  404. * @since 15.0.0
  405. */
  406. final public function getMore(): array {
  407. return $this->more;
  408. }
  409. /**
  410. * Add some excerpt of the content of the original document, usually based
  411. * on the search request.
  412. *
  413. * @since 16.0.0
  414. */
  415. final public function addExcerpt(string $source, string $excerpt): IIndexDocument {
  416. $this->excerpts[] =
  417. [
  418. 'source' => $source,
  419. 'excerpt' => $this->cleanExcerpt($excerpt)
  420. ];
  421. return $this;
  422. }
  423. /**
  424. * Set all excerpts of the content of the original document.
  425. *
  426. * @since 16.0.0
  427. */
  428. final public function setExcerpts(array $excerpts): IIndexDocument {
  429. $new = [];
  430. foreach ($excerpts as $entry) {
  431. $new[] = [
  432. 'source' => $entry['source'],
  433. 'excerpt' => $this->cleanExcerpt($entry['excerpt'])
  434. ];
  435. }
  436. $this->excerpts = $new;
  437. return $this;
  438. }
  439. /**
  440. * Get all excerpts of the content of the original document.
  441. *
  442. * @since 15.0.0
  443. */
  444. final public function getExcerpts(): array {
  445. return $this->excerpts;
  446. }
  447. /**
  448. * Clean excerpt.
  449. *
  450. * @since 16.0.0
  451. */
  452. private function cleanExcerpt(string $excerpt): string {
  453. $excerpt = str_replace("\\n", ' ', $excerpt);
  454. $excerpt = str_replace("\\r", ' ', $excerpt);
  455. $excerpt = str_replace("\\t", ' ', $excerpt);
  456. $excerpt = str_replace("\n", ' ', $excerpt);
  457. $excerpt = str_replace("\r", ' ', $excerpt);
  458. $excerpt = str_replace("\t", ' ', $excerpt);
  459. return $excerpt;
  460. }
  461. /**
  462. * Set the score to the result assigned to this document during a search
  463. * request.
  464. *
  465. * @since 15.0.0
  466. */
  467. final public function setScore(string $score): IIndexDocument {
  468. $this->score = $score;
  469. return $this;
  470. }
  471. /**
  472. * Get the score.
  473. *
  474. * @since 15.0.0
  475. */
  476. final public function getScore(): string {
  477. return $this->score;
  478. }
  479. /**
  480. * Set some information about the original document that will be available
  481. * to the front-end when displaying search result. (as string)
  482. * Because this information will not be indexed, this method can also be
  483. * used to manage some data while filling the IIndexDocument before its
  484. * indexing.
  485. *
  486. * @since 15.0.0
  487. */
  488. final public function setInfo(string $info, string $value): IIndexDocument {
  489. $this->info[$info] = $value;
  490. return $this;
  491. }
  492. /**
  493. * Get an information about a document. (string)
  494. *
  495. * @since 15.0.0
  496. */
  497. final public function getInfo(string $info, string $default = ''): string {
  498. if (!key_exists($info, $this->info)) {
  499. return $default;
  500. }
  501. return $this->info[$info];
  502. }
  503. /**
  504. * Set some information about the original document that will be available
  505. * to the front-end when displaying search result. (as array)
  506. * Because this information will not be indexed, this method can also be
  507. * used to manage some data while filling the IIndexDocument before its
  508. * indexing.
  509. *
  510. * @since 15.0.0
  511. */
  512. final public function setInfoArray(string $info, array $value): IIndexDocument {
  513. $this->info[$info] = $value;
  514. return $this;
  515. }
  516. /**
  517. * Get an information about a document. (array)
  518. *
  519. * @since 15.0.0
  520. */
  521. final public function getInfoArray(string $info, array $default = []): array {
  522. if (!key_exists($info, $this->info)) {
  523. return $default;
  524. }
  525. return $this->info[$info];
  526. }
  527. /**
  528. * Set some information about the original document that will be available
  529. * to the front-end when displaying search result. (as int)
  530. * Because this information will not be indexed, this method can also be
  531. * used to manage some data while filling the IIndexDocument before its
  532. * indexing.
  533. *
  534. * @since 15.0.0
  535. */
  536. final public function setInfoInt(string $info, int $value): IIndexDocument {
  537. $this->info[$info] = $value;
  538. return $this;
  539. }
  540. /**
  541. * Get an information about a document. (int)
  542. *
  543. * @since 15.0.0
  544. */
  545. final public function getInfoInt(string $info, int $default = 0): int {
  546. if (!key_exists($info, $this->info)) {
  547. return $default;
  548. }
  549. return $this->info[$info];
  550. }
  551. /**
  552. * Set some information about the original document that will be available
  553. * to the front-end when displaying search result. (as bool)
  554. * Because this information will not be indexed, this method can also be
  555. * used to manage some data while filling the IIndexDocument before its
  556. * indexing.
  557. *
  558. * @since 15.0.0
  559. */
  560. final public function setInfoBool(string $info, bool $value): IIndexDocument {
  561. $this->info[$info] = $value;
  562. return $this;
  563. }
  564. /**
  565. * Get an information about a document. (bool)
  566. *
  567. * @since 15.0.0
  568. */
  569. final public function getInfoBool(string $info, bool $default = false): bool {
  570. if (!key_exists($info, $this->info)) {
  571. return $default;
  572. }
  573. return $this->info[$info];
  574. }
  575. /**
  576. * Get all info.
  577. *
  578. * @since 15.0.0
  579. */
  580. final public function getInfoAll(): array {
  581. $info = [];
  582. foreach ($this->info as $k => $v) {
  583. if (str_starts_with($k, '_')) {
  584. continue;
  585. }
  586. $info[$k] = $v;
  587. }
  588. return $info;
  589. }
  590. /**
  591. * @since 15.0.0
  592. *
  593. * On some version of PHP, it is better to force destruct the object.
  594. * And during the index, the number of generated IIndexDocument can be
  595. * _huge_.
  596. */
  597. public function __destruct() {
  598. unset($this->id);
  599. unset($this->providerId);
  600. unset($this->access);
  601. unset($this->modifiedTime);
  602. unset($this->title);
  603. unset($this->content);
  604. unset($this->hash);
  605. unset($this->link);
  606. unset($this->source);
  607. unset($this->tags);
  608. unset($this->metaTags);
  609. unset($this->subTags);
  610. unset($this->more);
  611. unset($this->excerpts);
  612. unset($this->score);
  613. unset($this->info);
  614. unset($this->contentEncoded);
  615. }
  616. /**
  617. * @since 15.0.0
  618. */
  619. public function jsonSerialize(): array {
  620. return [
  621. 'id' => $this->getId(),
  622. 'providerId' => $this->getProviderId(),
  623. 'access' => $this->access,
  624. 'modifiedTime' => $this->getModifiedTime(),
  625. 'title' => $this->getTitle(),
  626. 'link' => $this->getLink(),
  627. 'index' => $this->index,
  628. 'source' => $this->getSource(),
  629. 'info' => $this->getInfoAll(),
  630. 'hash' => $this->getHash(),
  631. 'contentSize' => $this->getContentSize(),
  632. 'tags' => $this->getTags(),
  633. 'metatags' => $this->getMetaTags(),
  634. 'subtags' => $this->getSubTags(),
  635. 'more' => $this->getMore(),
  636. 'excerpts' => $this->getExcerpts(),
  637. 'score' => $this->getScore()
  638. ];
  639. }
  640. }