Add MDN annotations to spec output

This change adds annotations in the margin of the spec output next to any items for which there’s a corresponding MDN article somewhere under https://developer.mozilla.org/en-US/docs/Web that has a Specifications section with a link to that item’s URL. The mechanism for inserting the annotations relies on data in a JSON file containing a mapping of spec ID-attribute values to MDN article pathnames. The change adds a copy of that file to the repo, along with a makefile for updating/regenerating the JSON file. Depends on whatwg/wattsi#89. Fixes #180.
whatwg · Sep 23, 2018 · 5125d6b · 5125d6b
1 parent 1d9fa63
commit 5125d6b
Show file tree

Hide file tree

Showing 5 changed files with 5,645 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,5 @@
 .temp/
 html/
 output/
+mdn/.id-list
+mdn/developer.mozilla.org/
diff --git a/build.sh b/build.sh
@@ -8,7 +8,7 @@ cd "$(dirname "$0")"
 DIR=$(pwd)
 
 # The latest required version of Wattsi. Update this if you change how ./build.sh invokes Wattsi.
-WATTSI_LATEST=70
+WATTSI_LATEST=74
 
 # Shared state variables throughout this script
 LOCAL_WATTSI=true
@@ -535,7 +535,7 @@ function runWattsi {
   if $QUIET; then
     WATTSI_ARGS+=( --quiet )
   fi
-  WATTSI_ARGS+=( "$1" "$HTML_SHA" "$2" "$BUILD_TYPE" "$HTML_CACHE/caniuse.json" "$HIGHLIGHT_SERVER_URL" )
+  WATTSI_ARGS+=( "$1" "$HTML_SHA" "$2" "$BUILD_TYPE" "$HTML_CACHE/caniuse.json" "$DIR/mdn/id-map.json" "$HIGHLIGHT_SERVER_URL" )
   if hash wattsi 2>/dev/null; then
     if [[ "$(wattsi --version | cut -d' ' -f2)" -lt "$WATTSI_LATEST" ]]; then
       echo

diff --git a/mdn/Makefile b/mdn/Makefile
@@ -0,0 +1,102 @@
+# Regex matching a table cell whose link points into html.spec.whatwg.org;
+# capture group 1 is the fragment ID that follows the hash sign.
+URL_REGEX := \s+<td>(?:<bdi>)?<a.*href=".+html.spec.whatwg.org[^\#]+\#([^"]*).+
+
+# CSS pseudo-classes whose MDN pages are fetched one by one.
+PSEUDO_CLASSES := active checked default defined disabled enabled focus \
+                  hover in-range indeterminate invalid link dir optional \
+                  out-of-range read-only read-write required target valid \
+                  visited
+
+# External tools invoked by the recipes; override on the command line,
+# e.g. "make RG=/opt/bin/rg".
+JQ := jq
+RG := rg
+WGET := wget
+GREP := grep
+PERL := perl
+
+# Build id-map.json: a JSON object keyed by spec ID. Each value is an array
+# with one [article path, title, summary] entry per mirrored MDN article
+# that links to that ID.
+#
+# For every ID in .id-list the recipe lists the matching articles with
+# ripgrep, runs getMNDArticleDetails.py on each one, and assembles the JSON
+# text by hand. The Perl filters then drop the ".html" suffix and the mirror
+# path prefix from the article path, and undo the backslash-escaping of
+# "(", ")" and "*" that .id-list applies to the IDs. jq parses and
+# re-serialises the result, so malformed JSON makes the recipe fail.
+#
+# The output goes to a temporary file that is moved into place only after
+# the pipeline succeeds, so a failed run never leaves a truncated
+# id-map.json that make would consider up to date. The helper script is a
+# prerequisite so that editing it triggers a rebuild.
+id-map.json: .id-list getMNDArticleDetails.py
+	(echo "{"; COMMA=""; for ID in $$(cat $<); \
+	  do echo "$$COMMA\"$$ID\": [" ; COMMA=","; ARRAY_COMMA=""; \
+	  for article in \
+	  $$($(RG) -l "\s+<td>(?:<bdi>)?<a.*href=\".+html.spec.whatwg.org[^#]+#$$ID\"" \
+	    developer.mozilla.org/en-US/docs/Web | sort); do \
+	    articledata=$$(./getMNDArticleDetails.py $$article); \
+	    echo "$$ARRAY_COMMA$$articledata"; ARRAY_COMMA=","; done; \
+	    echo "]"; done; echo "}") \
+	  | $(PERL) -pe 's/\.html"/"/' \
+	  | $(PERL) -pe 's/developer\.mozilla\.org\/en-US\/docs\/Web\///' \
+	  | $(PERL) -pe 's/\x{5c}\x{28}/\(/g' \
+	  | $(PERL) -pe 's/\x{5c}\x{29}/\)/g' \
+	  | $(PERL) -pe 's/\x{5c}\x{2a}/\*/g' \
+	  | $(JQ) . > $@.tmp \
+	  || { rm -f $@.tmp; exit 1; }
+	mv -f $@.tmp $@
+
+# Build .id-list: the sorted, de-duplicated html.spec.whatwg.org fragment IDs
+# that the mirrored MDN articles link to, one per line. "(", ")" and "*" are
+# backslash-escaped because the id-map.json recipe embeds each ID in a regex.
+#
+# The final command rejects generated "back" IDs (anything matching \w:\w,
+# apart from the-lang-and-xml:lang-attributes). On rejection the freshly
+# written list is removed; otherwise it would stay on disk, look up to date,
+# and let the next "make" skip this check entirely.
+.id-list: developer.mozilla.org
+	$(RG) -t html --no-filename '$(URL_REGEX)' $< \
+	  | $(PERL) -pe 's/$(URL_REGEX)/$$1/' | sort  | uniq \
+	  | $(PERL) -pe 's/\(/\\(/g' \
+	  | $(PERL) -pe 's/\)/\\)/g' \
+	  | $(PERL) -pe 's/\*/\\*/g' \
+	  | sort  | uniq > $@
+	# Fail if the ID list contains any generated "back" IDs.
+	# (These need to be changed in the corresponding MDN article to
+	# some other non-back-IDs.)
+	! ($(GREP) -v "the-lang-and-xml:lang-attributes" $@ \
+	  | $(GREP) "\w:\w") \
+	  || { rm -f $@; exit 1; }
+
+# Download a local mirror of the MDN pages under /en-US/docs/Web into the
+# directory developer.mozilla.org, then fetch the CSS pseudo-class pages
+# listed in PSEUDO_CLASSES plus the Pseudo-classes overview page into
+# en-US/docs/Web/CSS inside that mirror.
+#
+# The leading "-" makes make ignore the exit status of the mirroring wget
+# (presumably because a large mirror run reports an error when any single
+# page fails -- confirm). The later fetches are fatal: the loop stops at the
+# first failed download or move instead of reporting only the status of its
+# last iteration.
+#
+# NOTE(review): make passes backslash-newline pairs through to the shell
+# unchanged, and the shell keeps them literally inside single quotes, so the
+# --exclude-directories and --reject values contain "\<newline>" between
+# entries. Confirm that wget still matches those entries as intended.
+developer.mozilla.org:
+	-$(WGET) --mirror --adjust-extension --trust-server-names \
+	  --include-directories=/en-US/docs/Web/HTML,/en-US/docs/Web/API \
+	  --exclude-directories='\
+/en-US/docs/Archive,\
+/en-US/docs/Mozilla,\
+/en-US/docs/Web/API/Bluetooth*,\
+/en-US/docs/Web/API/Device*,\
+/en-US/docs/Web/API/EXT*,\
+/en-US/docs/Web/API/Gamepad*,\
+/en-US/docs/Web/API/IDB*,\
+/en-US/docs/Web/API/MediaKey*,\
+/en-US/docs/Web/API/MediaStream*,\
+/en-US/docs/Web/API/MediaTrack*,\
+/en-US/docs/Web/API/OES*,\
+/en-US/docs/Web/API/Payment*,\
+/en-US/docs/Web/API/Performance*,\
+/en-US/docs/Web/API/Presentation*,\
+/en-US/docs/Web/API/RTC*,\
+/en-US/docs/Web/API/Readable*,\
+/en-US/docs/Web/API/SVG*,\
+/en-US/docs/Web/API/ServiceWorker*,\
+/en-US/docs/Web/API/Speech*,\
+/en-US/docs/Web/API/WebVR*,\
+/en-US/docs/Web/API/VR*,\
+/en-US/docs/Web/API/WEBGL*,\
+/en-US/docs/Web/API/WebGL*' \
+	  --reject '*$$edit,*$$history,*$$json,*$$locales,\
+*Bluetooth*,\
+*Device*,\
+*EXT*,\
+*Gamepad*,\
+*IDB*,\
+*MediaKey*,\
+*MediaStream*,\
+*MediaTrack*,\
+*Moz*,\
+*Ms*,\
+*OES*,\
+*Performance*,\
+*Presentation*,\
+*RTC*,\
+*Readable*,\
+*SVG*,\
+*ServiceWorker*,\
+*Speech*,\
+*VR*,\
+*WEBGL*,\
+*WebGL*' \
+	  https://developer.mozilla.org/en-US/docs/Web
+	mkdir $@/en-US/docs/Web/CSS
+	for PSEUDO_CLASS in $(PSEUDO_CLASSES); do \
+	  $(WGET) --adjust-extension \
+	  https://developer.mozilla.org/en-US/docs/Web/CSS/:$$PSEUDO_CLASS \
+	  && mv :$$PSEUDO_CLASS.html $@/en-US/docs/Web/CSS/ \
+	  || exit 1; \
+	done
+	$(WGET) --adjust-extension \
+	  https://developer.mozilla.org/en-US/docs/Web/CSS/Pseudo-classes
+	mv Pseudo-classes.html $@/en-US/docs/Web/CSS/
+
+# Remove the generated ID list and the downloaded MDN mirror; id-map.json is
+# left in place. Declared phony so that a stray file named "clean" cannot
+# stop this recipe from running.
+.PHONY: clean
+clean:
+	rm -f .id-list
+	rm -rf developer.mozilla.org
diff --git a/mdn/getMNDArticleDetails.py b/mdn/getMNDArticleDetails.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python2
+import sys
+from lxml.html import parse
+
+
+def normalize(string):
+    return string.encode('utf-8') \
+        .replace('"', '\\"') \
+        .replace("\xc2\xa0", " ") \
+        .replace('\n', ' ') \
+        .strip()
+
+mdnpath = sys.argv[1]
+doc = parse(mdnpath)
+firstParagraphXPath = '//article/p[string-length(text()) > 0][1]//text()'
+title = normalize(''.join(doc.xpath('/html/head/title/text()'))
+                  .split(" - ")[0].split(": ")[0])
+summary = normalize(''.join(doc.xpath('//*[@class="seoSummary"]//text()')))
+if summary == '':
+    # Found no seoSummary, so get text of class=summary paragraph.
+    summary = normalize(''.join(doc.xpath('//*[@class="summary"]//text()')))
+if summary == '':
+    # Found no seoSummary or summary, so get text of the first <p> of article.
+    summary = normalize(''.join(doc.xpath(firstParagraphXPath)))
+print '["' + mdnpath + '","' + title + '","' + summary + '"]'
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,3 +2,5 @@ @@
     .temp/
     html/
     output/
+    mdn/.id-list
+    mdn/developer.mozilla.org/