crawl.ts

import { ACTIVITY_PUB, JOB_REQUEST_TIMEOUT, WEBSERVER } from '../../initializers/constants'
import { doRequest } from '../../helpers/requests'
import { logger } from '../../helpers/logger'
import * as Bluebird from 'bluebird'
import { ActivityPubOrderedCollection } from '../../../shared/models/activitypub'
import { checkUrlsSameHost } from '../../helpers/activitypub'
import { parse } from 'url'

type HandlerFunction<T> = (items: T[]) => (Promise<any> | Bluebird<any>)
type CleanerFunction = (startedDate: Date) => (Promise<any> | Bluebird<any>)

// Fetch an ActivityPub ordered collection page by page, passing each page's items to the
// handler. If a cleaner is provided, it is called afterwards with the crawl start date so
// stale local data can be removed.
async function crawlCollectionPage <T> (uri: string, handler: HandlerFunction<T>, cleaner?: CleanerFunction) {
  logger.info('Crawling ActivityPub data on %s.', uri)

  const options = {
    method: 'GET',
    uri,
    json: true,
    activityPub: true,
    timeout: JOB_REQUEST_TIMEOUT
  }

  const startDate = new Date()

  const response = await doRequest<ActivityPubOrderedCollection<T>>(options)
  const firstBody = response.body

  const limit = ACTIVITY_PUB.FETCH_PAGE_LIMIT
  let i = 0
  let nextLink = firstBody.first
  while (nextLink && i < limit) {
    // Don't crawl ourselves. Stop here: a `continue` would loop forever,
    // since nextLink and i would never advance.
    const remoteHost = parse(nextLink).host
    if (remoteHost === WEBSERVER.HOST) break

    options.uri = nextLink

    const { body } = await doRequest<ActivityPubOrderedCollection<T>>(options)
    nextLink = body.next
    i++

    if (Array.isArray(body.orderedItems)) {
      const items = body.orderedItems
      logger.info('Processing %i ActivityPub items for %s.', items.length, options.uri)

      await handler(items)
    }
  }

  if (cleaner) await cleaner(startDate)
}

export {
  crawlCollectionPage
}
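
// --- Usage sketch (not part of the original file) ---
// A minimal, hypothetical example of how crawlCollectionPage might be called, e.g. from a
// job that refreshes a remote account's activities. The outbox URL, the Activity type and
// the processActivities/removeStaleItems helpers are assumptions for illustration only.
//
// import { crawlCollectionPage } from './crawl'
// import { Activity } from '../../../shared/models/activitypub'
//
// async function refreshRemoteOutbox () {
//   const outboxUrl = 'https://peertube.example.com/accounts/alice/outbox' // hypothetical remote collection
//
//   // Called once per fetched page with that page's orderedItems.
//   const processActivities = async (items: Activity[]) => {
//     // Persist or dispatch each fetched activity here.
//   }
//
//   // Called once after the crawl with the crawl start date.
//   const removeStaleItems = async (crawlStartedAt: Date) => {
//     // Delete local copies older than crawlStartedAt that the crawl did not refresh.
//   }
//
//   await crawlCollectionPage<Activity>(outboxUrl, processActivities, removeStaleItems)
// }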