fix(editor): preserve U+00A0 non-breaking space (#3037) (#7585)

* fix(editor): preserve U+00A0 non-breaking space (#3037) Non-breaking spaces were silently normalized to regular spaces at every ingestion point, so typed/pasted/imported nbsps never reached the changeset and users could not glue words against line-wrap in French or other languages that require nbsp typography. Removed the four strip sites that replaced U+00A0 with U+0020: - src/node/db/Pad.ts cleanText - src/static/js/contentcollector.ts textify - src/static/js/ace2_inner.ts textify - src/static/js/ace2_inner.ts importText raw-text guard Updated both processSpaces functions (domline and ExportHtml) to tokenize U+00A0 as a separate unit, emit it verbatim as  , and treat it as content (not whitespace) for the run-collapse bookkeeping so adjacent regular-space runs aren't miscounted. Added backend round-trip tests for spliceText and setText, and extended the cleanText case table. Updated the existing contentcollector and importexport specs whose expectations encoded the previous buggy behavior; they now assert genuine nbsp preservation. Verified manually in Firefox: clipboard U+00A0 → paste → pad → getText returns c2 a0; getHTML emits `100 km`. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * fix(contentcollector): collapse display-artifact nbsp runs on DOM read-back processSpaces is a lossy one-way display transform: leading/trailing spaces and all-but-the-last of a run get rendered as   so HTML doesn't collapse them. When incorporateUserChanges reads text back from the DOM, those display-artifact nbsps were being stored in the changeset model instead of being normalized back to plain spaces. This broke handleReturnIndentation, whose /^ *(?:)/ regex only matches ASCII spaces: auto-indent after `foo:\n` produced 4 spaces instead of the expected prev-indent (2) + THE_TAB (4) = 6, because the previous line's model had nbsps where it used to have spaces. Fix: in contentcollector.textify, collapse any [ ]+ run back to plain spaces UNLESS the run is pure U+00A0 AND strictly interior to word chars. That preserves user-intended typographic nbsps like "100 km" while undoing the one-way display transform. Updated 7 contentcollector tests and 7 importexport tests whose assertions needed to reflect the new rule (boundary/mixed runs collapse; pure-interior nbsp runs preserve). Fixes the Playwright regression in indentation.spec.ts:117 that the previous commit introduced. * fix(contentcollector): canonicalize nbsp runs at line assembly, not per text node Addresses Qodo code review feedback on PR #7585. ## Bug fix — nbsp lost at DOM text-node boundary The previous approach ran the "collapse display-artifact nbsp" rule inside textify(), which is called per individual DOM TEXT_NODE. A user-intended nbsp sitting at a text-node boundary (e.g., <span>100</span><span> km </span>) was incorrectly seen as non-interior (before === '' for the second text node) and normalized back to a regular space. Fix: move the canonicalization out of textify() and run it on each fully assembled line string inside cc.finish(). The rule remains: [ ]+ run -> plain spaces UNLESS pure U+00A0 AND strictly interior to non-ws chars It is length-preserving, so attribute offsets and line lengths are unaffected. Added a regression test (contentcollector.spec.ts) for the cross-span case. ## Docs concern Reverted the type-only addition of spliceText to PadType. spliceText is an existing Pad runtime method; the backend test now uses a cast (`(pad as any).spliceText`) so the PR does not expand the declared public type surface, avoiding a separate documentation requirement. --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 04:06:37 +02:00 · 2026-04-23 08:57:30 +01:00 · 2026-04-23 08:57:30 +01:00 · 67e542d2b9
commit 67e542d2b9
parent de5feb2eb5
8 changed files with 85 additions and 17 deletions
--- a/src/node/db/Pad.ts
+++ b/src/node/db/Pad.ts
@ -51,8 +51,7 @@ type PadSettings = {
 */
 exports.cleanText = (txt:string): string => txt.replace(/\r\n/g, '\n')
    .replace(/\r/g, '\n')
-    .replace(/\t/g, '        ')
-    .replace(/\xa0/g, ' ');
+    .replace(/\t/g, '        ');

 class Pad {
  private db: Database;
--- a/src/node/utils/ExportHtml.ts
+++ b/src/node/utils/ExportHtml.ts
@ -537,13 +537,16 @@ const _processSpaces = (s: string) => {
  const doesWrap = true;
  if (s.indexOf('<') < 0 && !doesWrap) {
    // short-cut
-    return s.replace(/ /g, '&nbsp;');
+    return s.replace(/[ \u00a0]/g, '&nbsp;');
  }
-  const parts = [];
-  s.replace(/<[^>]*>?| |[^ <]+/g, (m) => {
+  const parts: string[] = [];
+  s.replace(/<[^>]*>?|[ \u00a0]|[^ \u00a0<]+/g, (m) => {
    parts.push(m);
    return m
  });
+  // U+00A0 is content for run-bookkeeping - it terminates a space run
+  // just like a word character would, so runs of regular spaces adjacent
+  // to a nbsp are not miscounted as one long run (issue #3037).
  if (doesWrap) {
    let endOfLine = true;
    let beforeSpace = false;
@ -578,6 +581,9 @@ const _processSpaces = (s: string) => {
      }
    }
  }
+  for (let i = 0; i < parts.length; i++) {
+    if (parts[i] === '\u00a0') parts[i] = '&nbsp;';
+  }
  return parts.join('');
 };

--- a/src/static/js/ace2_inner.ts
+++ b/src/static/js/ace2_inner.ts
@ -476,8 +476,8 @@ function Ace2Inner(editorInfo, cssManagers) {
      if (text.charAt(text.length - 1) !== '\n') {
        throw new Error('new raw text must end with newline');
      }
-      if (/[\r\t\xa0]/.exec(text)) {
-        throw new Error('new raw text must not contain CR, tab, or nbsp');
+      if (/[\r\t]/.exec(text)) {
+        throw new Error('new raw text must not contain CR or tab');
      }
      lines = text.substring(0, text.length - 1).split('\n');
    } else {
@ -2042,7 +2042,7 @@ function Ace2Inner(editorInfo, cssManagers) {
      (nonEmpty) => domline.createDomLine(nonEmpty, doesWrap, browser, document);

  const textify =
-      (str) => str.replace(/[\n\r ]/g, ' ').replace(/\xa0/g, ' ').replace(/\t/g, '        ');
+      (str) => str.replace(/[\n\r ]/g, ' ').replace(/\t/g, '        ');

  const _blockElems = {
    div: 1,
--- a/src/static/js/contentcollector.ts
+++ b/src/static/js/contentcollector.ts
@ -79,9 +79,31 @@ const makeContentCollector = (collectStyles, abrowser, apool, className2Author)
  const textify = (str) => sanitizeUnicode(
      str.replace(/(\n | \n)/g, ' ')
          .replace(/[\n\r ]/g, ' ')
-          .replace(/\xa0/g, ' ')
          .replace(/\t/g, '        '));

+  // processSpaces (domline.ts, ExportHtml.ts) is a lossy one-way display
+  // transform: leading/trailing spaces and all-but-the-last space in a run
+  // are rendered as &nbsp; to defeat HTML whitespace collapsing, so any
+  // round-trip through the DOM sees nbsps where the model has plain spaces.
+  // To keep the model canonical, a [ \u00a0]+ run read back from the fully
+  // assembled line is collapsed to plain spaces unless it is strictly
+  // interior to non-whitespace chars AND contains only U+00A0 (issue #3037 -
+  // user-intended nbsp between words such as "100 km"). The rule runs on
+  // the whole line (not per DOM text node) so that nbsps sitting at a
+  // span boundary - e.g. <span>100</span><span>&nbsp;km</span> - are still
+  // seen as interior. The transform is length-preserving so attribution
+  // offsets stay consistent.
+  const canonicalizeNbspRuns = (s: string) => s.replace(
+      /[ \u00a0]+/g,
+      (run: string, offset: number, src: string) => {
+        const before = offset > 0 ? src[offset - 1] : '';
+        const after = offset + run.length < src.length
+            ? src[offset + run.length] : '';
+        const pureNbsp = !run.includes(' ');
+        const interiorOfWord = /\S/.test(before) && /\S/.test(after);
+        return pureNbsp && interiorOfWord ? run : ' '.repeat(run.length);
+      });
+
  const getAssoc = (node, name) => node[`_magicdom_${name}`];

  const lines = (() => {
@ -642,6 +664,12 @@ const makeContentCollector = (collectStyles, abrowser, apool, className2Author)
    lineStrings.length--;
    lineAttribs.length--;

+    // Canonicalize display-artifact nbsps on the whole-line string (issue
+    // #3037). Length-preserving, so no attribute adjustments are required.
+    for (let i = 0; i < lineStrings.length; i++) {
+      lineStrings[i] = canonicalizeNbspRuns(lineStrings[i]);
+    }
+
    const ss = getSelectionStart();
    const se = getSelectionEnd();

--- a/src/static/js/domline.ts
+++ b/src/static/js/domline.ts
@ -231,12 +231,15 @@ domline.createDomLine = (nonEmpty, doesWrap, optBrowser, optDocument) => {
 domline.processSpaces = (s, doesWrap) => {
  if (s.indexOf('<') < 0 && !doesWrap) {
    // short-cut
-    return s.replace(/ /g, '&nbsp;');
+    return s.replace(/[ \u00a0]/g, '&nbsp;');
  }
  const parts = [];
-  s.replace(/<[^>]*>?| |[^ <]+/g, (m) => {
+  s.replace(/<[^>]*>?|[ \u00a0]|[^ \u00a0<]+/g, (m) => {
    parts.push(m);
  });
+  // U+00A0 is content for run-bookkeeping - it terminates a space run
+  // just like a word character would, so runs of regular spaces adjacent
+  // to a nbsp are not miscounted as one long run (issue #3037).
  if (doesWrap) {
    let endOfLine = true;
    let beforeSpace = false;
@ -271,6 +274,9 @@ domline.processSpaces = (s, doesWrap) => {
      }
    }
  }
+  for (let i = 0; i < parts.length; i++) {
+    if (parts[i] === '\u00a0') parts[i] = '&nbsp;';
+  }
  return parts.join('');
 };

--- a/src/tests/backend/specs/Pad.ts
+++ b/src/tests/backend/specs/Pad.ts
@ -45,6 +45,9 @@ describe(__filename, function () {
      ['x\ry\n', 'x\ny\n'],
      ['x\r\ny\n', 'x\ny\n'],
      ['x\r\r\ny\n', 'x\n\ny\n'],
+      // Non-breaking space (U+00A0) must survive cleanText (issue #3037).
+      ['100\u00a0km\n', '100\u00a0km\n'],
+      ['a\u00a0\u00a0b\n', 'a\u00a0\u00a0b\n'],
    ];
    for (const [input, want] of testCases) {
      it(`${JSON.stringify(input)} -> ${JSON.stringify(want)}`, async function () {
@ -53,6 +56,22 @@ describe(__filename, function () {
    }
  });

+  describe('non-breaking space preservation (issue #3037)', function () {
+    it('spliceText round-trips U+00A0', async function () {
+      pad = await padManager.getPad(padId, '');
+      // spliceText is an existing runtime Pad method; cast avoids
+      // adding a type-only declaration to PadType in this PR.
+      await (pad as any).spliceText(0, 0, '100\u00a0km');
+      assert.equal(pad!.text(), '100\u00a0km\n');
+    });
+
+    it('setText round-trips U+00A0', async function () {
+      pad = await padManager.getPad(padId, '');
+      await pad!.setText('a\u00a0b\n');
+      assert.equal(pad!.text(), 'a\u00a0b\n');
+    });
+  });
+
  describe('padDefaultContent hook', function () {
    it('runs when a pad is created without specific text', async function () {
      const p = new Promise<void>((resolve) => {
--- a/src/tests/backend/specs/api/importexport.ts
+++ b/src/tests/backend/specs/api/importexport.ts
@ -79,8 +79,8 @@ const testImports:MapArrayType<any> = {
    // XXX the HTML between "than" and "one" looks strange
    description: 'non-breaking space should be preserved, but can be replaced when it',
    input: '<html><body>Text&nbsp;with&nbsp; more&nbsp;&nbsp;&nbsp;than   &nbsp;one space.<br></body></html>',
-    wantHTML: '<!DOCTYPE HTML><html><body>Text with&nbsp; more&nbsp;&nbsp; than&nbsp; one space.<br><br></body></html>',
-    wantText: 'Text with  more   than  one space.\n\n',
+    wantHTML: '<!DOCTYPE HTML><html><body>Text&nbsp;with&nbsp; more&nbsp;&nbsp;&nbsp;than&nbsp; one space.<br><br></body></html>',
+    wantText: 'Text\u00a0with  more\u00a0\u00a0\u00a0than  one space.\n\n',
  },
  'multiplenbsp': {
    description: 'Multiple non-breaking space should be preserved',
@ -91,8 +91,8 @@ const testImports:MapArrayType<any> = {
  'multipleNonBreakingSpaceBetweenWords': {
    description: 'A normal space is always inserted before a word',
    input: '<html><body>&nbsp;&nbsp;word1&nbsp;&nbsp;word2&nbsp;&nbsp;&nbsp;word3<br></body></html>',
-    wantHTML: '<!DOCTYPE HTML><html><body>&nbsp; word1&nbsp; word2&nbsp;&nbsp; word3<br><br></body></html>',
-    wantText: '  word1  word2   word3\n\n',
+    wantHTML: '<!DOCTYPE HTML><html><body>&nbsp; word1&nbsp;&nbsp;word2&nbsp;&nbsp;&nbsp;word3<br><br></body></html>',
+    wantText: '  word1\u00a0\u00a0word2\u00a0\u00a0\u00a0word3\n\n',
  },
  'nonBreakingSpacePreceededBySpaceBetweenWords': {
    description: 'A non-breaking space preceded by a normal space',
--- a/src/tests/backend/specs/contentcollector.ts
+++ b/src/tests/backend/specs/contentcollector.ts
@ -185,7 +185,7 @@ const testCases = [
    description: 'non-breaking and normal space should be preserved',
    html: '<html><body>Text&nbsp;with&nbsp; more&nbsp;&nbsp;&nbsp;than   &nbsp;one space.<br></body></html>',
    wantAlines: ['+10'],
-    wantText: ['Text with  more   than    one space.'],
+    wantText: ['Text\u00a0with  more\u00a0\u00a0\u00a0than    one space.'],
  },
  {
    description: 'Multiple nbsp should be preserved',
@ -197,7 +197,7 @@ const testCases = [
    description: 'Multiple nbsp between words ',
    html: '<html><body>&nbsp;&nbsp;word1&nbsp;&nbsp;word2&nbsp;&nbsp;&nbsp;word3<br></body></html>',
    wantAlines: ['+m'],
-    wantText: ['  word1  word2   word3'],
+    wantText: ['  word1\u00a0\u00a0word2\u00a0\u00a0\u00a0word3'],
  },
  {
    description: 'A non-breaking space preceded by a normal space',
@ -359,6 +359,16 @@ pre
    ],
    wantText: ['before', '*bare item', 'after'],
  },
+  {
+    // Regression for PR #7585 review feedback: a user-intended nbsp
+    // sitting at a DOM text-node boundary (split across spans) must
+    // still be preserved because canonicalization runs on the whole
+    // line, not per text node.
+    description: 'nbsp preserved across span boundary',
+    html: '<html><body><p><span>100</span><span>&nbsp;km</span></p></body></html>',
+    wantAlines: ['+6'],
+    wantText: ['100\u00a0km'],
+  },
 ];

 describe(__filename, function () {