Explorar o código

rework the search indexing process

- reconfigure the docsearch scraper to store component and version for each result
- switch from the Docker image to a reusable GitHub Action
- add publish-docsearch-config extension to transform Handlebars into YAML
Dan Allen hai 2 anos
pai
achega
409bd29abd

+ 0 - 20
.github/actions/algolia-config.json

@@ -1,20 +0,0 @@
-{
-  "index_name": "security-docs",
-  "start_urls": [
-    "https://docs.spring.io/spring-security/reference/"
-  ],
-  "selectors": {
-    "lvl0": {
-      "selector": "//nav[@class='crumbs']//li[@class='crumb'][last()-1]",
-      "type": "xpath",
-      "global": true,
-      "default_value": "Home"
-    },
-    "lvl1": ".doc h1",
-    "lvl2": ".doc h2",
-    "lvl3": ".doc h3",
-    "lvl4": ".doc h4",
-    "text": ".doc p, .doc td.content, .doc th.tableblock"
-  }
-}
-

+ 0 - 21
.github/actions/algolia-docsearch-scraper.sh

@@ -1,21 +0,0 @@
-#!/bin/bash
-
-###
-# Docs
-# config.json https://docsearch.algolia.com/docs/config-file
-# Run the crawler https://docsearch.algolia.com/docs/run-your-own/#run-the-crawl-from-the-docker-image
-
-### USAGE
-if [ "$#" -ne 3 ]; then
-  echo -e "not enough arguments USAGE:\n\n$0 \$ALGOLIA_APPLICATION_ID \$ALGOLIA_API_KEY \$CONFIG_FILE\n\n" >&2
-  exit 1
-fi
-
-# Script Parameters
-APPLICATION_ID=$1
-API_KEY=$2
-CONFIG_FILE=$3
-
-#### Script
-script_dir=$(dirname $0)
-docker run -e "APPLICATION_ID=$APPLICATION_ID" -e "API_KEY=$API_KEY" -e "CONFIG=$(cat $CONFIG_FILE | jq -r tostring)" algolia/docsearch-scraper

+ 67 - 0
.github/actions/docsearch-config.json.hbs

@@ -0,0 +1,67 @@
+{
+  "index_name": "spring-security-docs",
+  "start_urls": [
+    {{#each components}}
+    {{#each versions}}
+    {
+      "url": "{{{@root.site.url}}}/{{#if (eq ./activeVersionSegment '')}}(?:$|index.html$|[a-z].*){{else}}{{{./activeVersionSegment}}}/{{/if}}",
+      "extra_attributes": {
+        "component": "{{#if (eq ./name 'ROOT')}}spring-security{{else}}{{{./name}}}{{/if}}",
+        "version": "{{{./version}}}",
+        "version_rank": {{#if (eq this ../latest)}}1{{else}}2{{/if}}
+      }
+    }{{#unless (and @last @../last)}},{{/unless}}
+    {{/each}}
+    {{/each}}
+  ],
+  "sitemap_urls": [
+    "{{{site.url}}}/sitemap.xml"
+  ],
+  "scrape_start_urls": true,
+  "stop_urls": [
+    {{#each stopPages}}
+    "{{{@root.site.url}}}{{{./pub.url}}}"{{#unless @last}},{{/unless}}
+    {{/each}}
+  ],
+  "selectors": {
+    "default": {
+      "lvl0": {
+        "global": true,
+        "selector": ".nav-panel-explore .context .title, .nav-panel-explore .context .version"
+      },
+      "lvl1": ".doc > h1.page",
+      "lvl2": ".doc .sect1 > h2:first-child",
+      "lvl3": ".doc .sect2 > h3:first-child",
+      "lvl4": ".doc .sect3 > h4:first-child",
+      "text": ".doc p, .doc dt, .doc td.content, .doc th.tableblock"
+    }
+  },
+  "selectors_exclude": [
+    "#section-summary"
+  ],
+  "min_indexed_level": 1,
+  "custom_settings": {
+    "advancedSyntax": true,
+    "attributesForFaceting": [
+      "component",
+      "version"
+    ],
+    "attributesToRetrieve": [
+      "anchor",
+      "content",
+      "hierarchy",
+      "url",
+      "component",
+      "version"
+    ],
+    "attributesToSnippet": [
+      "content:25"
+    ],
+    "customRanking": [
+      "desc(weight.page_rank)",
+      "asc(version_rank)",
+      "desc(weight.level)",
+      "asc(weight.position)"
+    ]
+  }
+}

+ 16 - 3
.github/workflows/rebuild-search-index.yml

@@ -8,8 +8,21 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - name: Checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
       with:
         fetch-depth: 5
-    - name: Run Docsearch Scraper
-      run: $GITHUB_WORKSPACE/.github/actions/algolia-docsearch-scraper.sh "${{ secrets.ALGOLIA_APPLICATION_ID }}" "${{ secrets.ALGOLIA_WRITE_API_KEY }}" $GITHUB_WORKSPACE/.github/actions/algolia-config.json
+    - name: Configure Indexer
+      run: |
+        CONFIG_FILE=.github/actions/docsearch-config.json
+        if [ ! -f $CONFIG_FILE ]; then
+          curl -sL -o $CONFIG_FILE $(node -p "require('fs').readFileSync('antora-playbook.yml', 'utf8').match(/^  url: (.*)/m)[1]")/docsearch-config.json
+        fi
+        INDEX_NAME=$(node -p "JSON.parse(require('fs').readFileSync('$CONFIG_FILE')).index_name")
+        echo "CONFIG_FILE=${CONFIG_FILE}" >> $GITHUB_ENV
+        echo "INDEX_NAME_TMP=${INDEX_NAME}-${GITHUB_RUN_ID}" >> $GITHUB_ENV
+    - name: Run Indexer
+      uses: darrenjennings/algolia-docsearch-action@master
+      with:
+        algolia_application_id: ${{ secrets.ALGOLIA_APP_ID }}
+        algolia_api_key: ${{ secrets.ALGOLIA_API_KEY }}
+        file: ${{ env.CONFIG_FILE }}

+ 43 - 0
antora-playbook-for-indexing.yml

@@ -0,0 +1,43 @@
+antora:
+  extensions:
+  - '@springio/antora-extensions/partial-build-extension'
+  - ./lib/antora/extensions/inject-collector-config.js
+  - '@antora/collector-extension'
+  - ./lib/antora/extensions/version-fix.js
+  - '@antora/atlas-extension'
+  - '@opendevise/antora-release-line-extension'
+  - require: '@springio/antora-extensions/tabs-migration-extension'
+    # uncomment this option to save the migrated content to the worktree
+    #save_result: true
+    unwrap_example_block: always
+  - id: publish-docsearch-config
+    require: ./lib/antora/extensions/publish-docsearch-config
+    template_path: ./.github/actions/docsearch-config.json.hbs
+site:
+  title: Spring Security
+  url: https://docs.spring.io/spring-security/reference
+  robots: allow
+git:
+  ensure_git_suffix: false
+content:
+  sources:
+  - url: https://github.com/spring-projects/spring-security
+    branches: main
+    tags: 6.0.1
+    start_path: docs
+asciidoc:
+  attributes:
+    page-pagination: ''
+    hide-uri-scheme: '@'
+    tabs-sync-option: '@'
+  extensions:
+  - '@asciidoctor/tabs'
+  - '@springio/asciidoctor-extensions'
+urls:
+  latest_version_segment_strategy: redirect:to
+  latest_version_segment: ''
+  redirect_facility: httpd
+ui:
+  bundle:
+    url: https://github.com/spring-io/antora-ui-spring/releases/download/latest/ui-bundle.zip
+    snapshot: true

+ 2 - 0
antora-playbook.yml

@@ -10,6 +10,8 @@ antora:
     # uncomment this option to save the migrated content to the worktree
     #save_result: true
     unwrap_example_block: always
+  - require: ./lib/antora/extensions/publish-docsearch-config
+    template_path: ./.github/actions/docsearch-config.json.hbs
 site:
   title: Spring Security
   url: https://docs.spring.io/spring-security/reference

+ 3 - 3
build.gradle

@@ -4,13 +4,13 @@ plugins {
 }
 
 antora {
-    version = '3.2.0-alpha.2'
+	version = '3.2.0-alpha.2'
 	options = ['--clean', '--fetch', '--stacktrace']
 	environment = [
 		'ALGOLIA_API_KEY': '82c7ead946afbac3cf98c32446154691',
 		'ALGOLIA_APP_ID': '244V8V9FGG',
-		'ALGOLIA_INDEX_NAME': 'security-docs',
-    ]
+		'ALGOLIA_INDEX_NAME': 'spring-security-docs',
+	]
 	dependencies = [
 		'@antora/atlas-extension': '1.0.0-alpha.1',
 		'@antora/collector-extension': '1.0.0-alpha.2',

+ 28 - 0
lib/antora/extensions/publish-docsearch-config.js

@@ -0,0 +1,28 @@
+'use strict'
+
+const fsp = require('node:fs/promises')
+const ospath = require('node:path')
+
+/**
+ * An Antora extension that generates the docsearch config file from a Handlebars template and publishes it with the
+ * site (as docsearch-config.json), where the scraper job can retrieve it. Config: templatePath, the template location.
+ */
+module.exports.register = function ({ config: { templatePath = './docsearch/config.json.hbs' } }) {
+  const expandPath = this.require('@antora/expand-path-helper')
+  const handlebars = this.require('handlebars').create()
+  handlebars.registerHelper('eq', (a, b) => a === b) // strict-equality helper for template conditionals
+  handlebars.registerHelper('and', (a, b) => a && b) // yields b when a is truthy, otherwise a
+
+  this.on('beforePublish', async ({ playbook, contentCatalog, siteCatalog }) => {
+    templatePath = expandPath(templatePath, { dot: playbook.dir }) // playbook.dir is passed as the `dot` base
+    const templateSrc = await fsp.readFile(templatePath, 'utf8')
+    const templateBasename = ospath.basename(templatePath)
+    const template = handlebars.compile(templateSrc, { noEscape: true, preventIndent: true, srcName: templateBasename })
+    const components = contentCatalog.getComponentsSortedBy('name').filter((component) => component.latest.version) // drop components whose latest version is falsy
+    const stopPages = contentCatalog.getPages((page) => { // pages with an `out` that carry page-archived or page-noindex
+      return page.out && ('page-archived' in page.asciidoc.attributes || 'page-noindex' in page.asciidoc.attributes)
+    })
+    const compiled = template({ components, site: playbook.site, stopPages }) // template context: components, site, stopPages
+    siteCatalog.addFile({ contents: Buffer.from(compiled), out: { path: 'docsearch-config.json' } }) // added to the site catalog at this output path
+  })
+}