crawl.ts 1.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. import { ACTIVITY_PUB, JOB_REQUEST_TIMEOUT } from '../../initializers'
  2. import { doRequest } from '../../helpers/requests'
  3. import { logger } from '../../helpers/logger'
  4. import * as Bluebird from 'bluebird'
  /**
   * Crawls an ActivityPub ordered collection page by page and passes the items
   * of each page to `handler`.
   *
   * The collection at `uri` is requested first; crawling then starts from its
   * `first` property and continues through each page's `next` property. At most
   * `ACTIVITY_PUB.FETCH_PAGE_LIMIT` pages are visited, which also bounds the work
   * done if a remote collection links its pages in a cycle.
   *
   * Pages are processed sequentially: the next page is only looked at once
   * `handler` has settled for the current one. Pages whose `orderedItems` is not
   * an array are skipped without calling `handler`.
   *
   * @param uri - URI of the collection to crawl.
   * @param handler - Called once per page with that page's `orderedItems`.
   * @throws Error if a request yields no body at all. Rejections from
   *   `doRequest` or `handler` are propagated unchanged.
   */
  async function crawlCollectionPage <T> (uri: string, handler: (items: T[]) => Promise<any> | Bluebird<any>) {
    logger.info('Crawling ActivityPub data on %s.', uri)

    const options = {
      method: 'GET',
      uri,
      json: true,
      activityPub: true,
      timeout: JOB_REQUEST_TIMEOUT
    }

    const response = await doRequest(options)
    const firstBody = response.body
    if (firstBody === undefined || firstBody === null) {
      throw new Error('Empty ActivityPub collection response from ' + uri)
    }

    const limit = ACTIVITY_PUB.FETCH_PAGE_LIMIT
    let nextLink = firstBody.first

    for (let i = 0; nextLink && i < limit; i++) {
      // `first` / `next` may be either the URI of a page or the page object
      // itself embedded in the parent document; only the former needs a request.
      let body = nextLink
      if (typeof nextLink === 'string') {
        options.uri = nextLink
        body = (await doRequest(options)).body

        if (body === undefined || body === null) {
          throw new Error('Empty ActivityPub collection page response from ' + nextLink)
        }
      }

      // Read the next link before running the handler so the crawl position
      // does not depend on what the handler does with the page.
      nextLink = body.next

      if (Array.isArray(body.orderedItems)) {
        const items = body.orderedItems
        logger.info('Processing %i ActivityPub items for %s.', items.length, options.uri)

        await handler(items)
      }
    }
  }
  // Public API of this module.
  export { crawlCollectionPage }