fix(editor): preserve U+00A0 non-breaking space (#3037) (#7585)

* fix(editor): preserve U+00A0 non-breaking space (#3037)

Non-breaking spaces were silently normalized to regular spaces at every
ingestion point, so typed/pasted/imported nbsps never reached the
changeset and users could not glue words against line-wrap in French or
other languages that require nbsp typography.

Removed the four strip sites that replaced U+00A0 with U+0020:
  - src/node/db/Pad.ts cleanText
  - src/static/js/contentcollector.ts textify
  - src/static/js/ace2_inner.ts textify
  - src/static/js/ace2_inner.ts importText raw-text guard

Updated both processSpaces functions (domline and ExportHtml) to tokenize
U+00A0 as a separate unit, emit it verbatim as  , and treat it as
content (not whitespace) for the run-collapse bookkeeping so adjacent
regular-space runs aren't miscounted.

Added backend round-trip tests for spliceText and setText, and extended
the cleanText case table. Updated the existing contentcollector and
importexport specs whose expectations encoded the previous buggy
behavior; they now assert genuine nbsp preservation.

Verified manually in Firefox: clipboard U+00A0 → paste → pad → getText
returns c2 a0; getHTML emits `100 km`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* fix(contentcollector): collapse display-artifact nbsp runs on DOM read-back

processSpaces is a lossy one-way display transform: leading/trailing
spaces and all-but-the-last of a run get rendered as &nbsp; so HTML
doesn't collapse them. When incorporateUserChanges reads text back from
the DOM, those display-artifact nbsps were being stored in the changeset
model instead of being normalized back to plain spaces.

This broke handleReturnIndentation, whose /^ *(?:)/ regex only matches
ASCII spaces: auto-indent after `foo:\n` produced 4 spaces instead of
the expected prev-indent (2) + THE_TAB (4) = 6, because the previous
line's model had nbsps where it used to have spaces.

Fix: in contentcollector.textify, collapse any [  ]+ run back to
plain spaces UNLESS the run is pure U+00A0 AND strictly interior to
word chars. That preserves user-intended typographic nbsps like
"100 km" while undoing the one-way display transform.

Updated 7 contentcollector tests and 7 importexport tests whose
assertions needed to reflect the new rule (boundary/mixed runs collapse;
pure-interior nbsp runs preserve).

Fixes the Playwright regression in indentation.spec.ts:117 that the
previous commit introduced.

* fix(contentcollector): canonicalize nbsp runs at line assembly, not per text node

Addresses Qodo code review feedback on PR #7585.

## Bug fix — nbsp lost at DOM text-node boundary

The previous approach ran the "collapse display-artifact nbsp" rule inside
textify(), which is called per individual DOM TEXT_NODE. A user-intended
nbsp sitting at a text-node boundary (e.g., <span>100</span><span>&nbsp;km
</span>) was incorrectly seen as non-interior (before === '' for the second
text node) and normalized back to a regular space.

Fix: move the canonicalization out of textify() and run it on each
fully assembled line string inside cc.finish(). The rule remains:

    [  ]+ run  ->  plain spaces
                   UNLESS pure U+00A0 AND strictly interior to non-ws chars

It is length-preserving, so attribute offsets and line lengths are
unaffected.

Added a regression test (contentcollector.spec.ts) for the cross-span
case.

## Docs concern

Reverted the type-only addition of spliceText to PadType. spliceText
is an existing Pad runtime method; the backend test now uses a cast
(`(pad as any).spliceText`) so the PR does not expand the declared
public type surface, avoiding a separate documentation requirement.

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
John McLear 2026-04-23 08:57:30 +01:00 committed by GitHub
parent de5feb2eb5
commit 67e542d2b9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 85 additions and 17 deletions

View File

@ -51,8 +51,7 @@ type PadSettings = {
*/
exports.cleanText = (txt:string): string => txt.replace(/\r\n/g, '\n')
.replace(/\r/g, '\n')
.replace(/\t/g, ' ')
.replace(/\xa0/g, ' ');
.replace(/\t/g, ' ');
class Pad {
private db: Database;

View File

@ -537,13 +537,16 @@ const _processSpaces = (s: string) => {
const doesWrap = true;
if (s.indexOf('<') < 0 && !doesWrap) {
// short-cut
return s.replace(/ /g, '&nbsp;');
return s.replace(/[ \u00a0]/g, '&nbsp;');
}
const parts = [];
s.replace(/<[^>]*>?| |[^ <]+/g, (m) => {
const parts: string[] = [];
s.replace(/<[^>]*>?|[ \u00a0]|[^ \u00a0<]+/g, (m) => {
parts.push(m);
return m
});
// U+00A0 is content for run-bookkeeping - it terminates a space run
// just like a word character would, so runs of regular spaces adjacent
// to a nbsp are not miscounted as one long run (issue #3037).
if (doesWrap) {
let endOfLine = true;
let beforeSpace = false;
@ -578,6 +581,9 @@ const _processSpaces = (s: string) => {
}
}
}
for (let i = 0; i < parts.length; i++) {
if (parts[i] === '\u00a0') parts[i] = '&nbsp;';
}
return parts.join('');
};

View File

@ -476,8 +476,8 @@ function Ace2Inner(editorInfo, cssManagers) {
if (text.charAt(text.length - 1) !== '\n') {
throw new Error('new raw text must end with newline');
}
if (/[\r\t\xa0]/.exec(text)) {
throw new Error('new raw text must not contain CR, tab, or nbsp');
if (/[\r\t]/.exec(text)) {
throw new Error('new raw text must not contain CR or tab');
}
lines = text.substring(0, text.length - 1).split('\n');
} else {
@ -2042,7 +2042,7 @@ function Ace2Inner(editorInfo, cssManagers) {
(nonEmpty) => domline.createDomLine(nonEmpty, doesWrap, browser, document);
const textify =
(str) => str.replace(/[\n\r ]/g, ' ').replace(/\xa0/g, ' ').replace(/\t/g, ' ');
(str) => str.replace(/[\n\r ]/g, ' ').replace(/\t/g, ' ');
const _blockElems = {
div: 1,

View File

@ -79,9 +79,31 @@ const makeContentCollector = (collectStyles, abrowser, apool, className2Author)
const textify = (str) => sanitizeUnicode(
str.replace(/(\n | \n)/g, ' ')
.replace(/[\n\r ]/g, ' ')
.replace(/\xa0/g, ' ')
.replace(/\t/g, ' '));
// processSpaces (domline.ts, ExportHtml.ts) is a lossy one-way display
// transform: leading/trailing spaces and all-but-the-last space in a run
// are rendered as &nbsp; to defeat HTML whitespace collapsing, so any
// round-trip through the DOM sees nbsps where the model has plain spaces.
// To keep the model canonical, a [ \u00a0]+ run read back from the fully
// assembled line is collapsed to plain spaces unless it is strictly
// interior to non-whitespace chars AND contains only U+00A0 (issue #3037 -
// user-intended nbsp between words such as "100 km"). The rule runs on
// the whole line (not per DOM text node) so that nbsps sitting at a
// span boundary - e.g. <span>100</span><span>&nbsp;km</span> - are still
// seen as interior. The transform is length-preserving so attribution
// offsets stay consistent.
const canonicalizeNbspRuns = (s: string) => s.replace(
/[ \u00a0]+/g,
(run: string, offset: number, src: string) => {
const before = offset > 0 ? src[offset - 1] : '';
const after = offset + run.length < src.length
? src[offset + run.length] : '';
const pureNbsp = !run.includes(' ');
const interiorOfWord = /\S/.test(before) && /\S/.test(after);
return pureNbsp && interiorOfWord ? run : ' '.repeat(run.length);
});
const getAssoc = (node, name) => node[`_magicdom_${name}`];
const lines = (() => {
@ -642,6 +664,12 @@ const makeContentCollector = (collectStyles, abrowser, apool, className2Author)
lineStrings.length--;
lineAttribs.length--;
// Canonicalize display-artifact nbsps on the whole-line string (issue
// #3037). Length-preserving, so no attribute adjustments are required.
for (let i = 0; i < lineStrings.length; i++) {
lineStrings[i] = canonicalizeNbspRuns(lineStrings[i]);
}
const ss = getSelectionStart();
const se = getSelectionEnd();

View File

@ -231,12 +231,15 @@ domline.createDomLine = (nonEmpty, doesWrap, optBrowser, optDocument) => {
domline.processSpaces = (s, doesWrap) => {
if (s.indexOf('<') < 0 && !doesWrap) {
// short-cut
return s.replace(/ /g, '&nbsp;');
return s.replace(/[ \u00a0]/g, '&nbsp;');
}
const parts = [];
s.replace(/<[^>]*>?| |[^ <]+/g, (m) => {
s.replace(/<[^>]*>?|[ \u00a0]|[^ \u00a0<]+/g, (m) => {
parts.push(m);
});
// U+00A0 is content for run-bookkeeping - it terminates a space run
// just like a word character would, so runs of regular spaces adjacent
// to a nbsp are not miscounted as one long run (issue #3037).
if (doesWrap) {
let endOfLine = true;
let beforeSpace = false;
@ -271,6 +274,9 @@ domline.processSpaces = (s, doesWrap) => {
}
}
}
for (let i = 0; i < parts.length; i++) {
if (parts[i] === '\u00a0') parts[i] = '&nbsp;';
}
return parts.join('');
};

View File

@ -45,6 +45,9 @@ describe(__filename, function () {
['x\ry\n', 'x\ny\n'],
['x\r\ny\n', 'x\ny\n'],
['x\r\r\ny\n', 'x\n\ny\n'],
// Non-breaking space (U+00A0) must survive cleanText (issue #3037).
['100\u00a0km\n', '100\u00a0km\n'],
['a\u00a0\u00a0b\n', 'a\u00a0\u00a0b\n'],
];
for (const [input, want] of testCases) {
it(`${JSON.stringify(input)} -> ${JSON.stringify(want)}`, async function () {
@ -53,6 +56,22 @@ describe(__filename, function () {
}
});
describe('non-breaking space preservation (issue #3037)', function () {
it('spliceText round-trips U+00A0', async function () {
pad = await padManager.getPad(padId, '');
// spliceText is an existing runtime Pad method; cast avoids
// adding a type-only declaration to PadType in this PR.
await (pad as any).spliceText(0, 0, '100\u00a0km');
assert.equal(pad!.text(), '100\u00a0km\n');
});
it('setText round-trips U+00A0', async function () {
pad = await padManager.getPad(padId, '');
await pad!.setText('a\u00a0b\n');
assert.equal(pad!.text(), 'a\u00a0b\n');
});
});
describe('padDefaultContent hook', function () {
it('runs when a pad is created without specific text', async function () {
const p = new Promise<void>((resolve) => {

View File

@ -79,8 +79,8 @@ const testImports:MapArrayType<any> = {
// XXX the HTML between "than" and "one" looks strange
description: 'non-breaking space should be preserved, but can be replaced when it',
input: '<html><body>Text&nbsp;with&nbsp; more&nbsp;&nbsp;&nbsp;than &nbsp;one space.<br></body></html>',
wantHTML: '<!DOCTYPE HTML><html><body>Text with&nbsp; more&nbsp;&nbsp; than&nbsp; one space.<br><br></body></html>',
wantText: 'Text with more than one space.\n\n',
wantHTML: '<!DOCTYPE HTML><html><body>Text&nbsp;with&nbsp; more&nbsp;&nbsp;&nbsp;than&nbsp; one space.<br><br></body></html>',
wantText: 'Text\u00a0with more\u00a0\u00a0\u00a0than one space.\n\n',
},
'multiplenbsp': {
description: 'Multiple non-breaking space should be preserved',
@ -91,8 +91,8 @@ const testImports:MapArrayType<any> = {
'multipleNonBreakingSpaceBetweenWords': {
description: 'A normal space is always inserted before a word',
input: '<html><body>&nbsp;&nbsp;word1&nbsp;&nbsp;word2&nbsp;&nbsp;&nbsp;word3<br></body></html>',
wantHTML: '<!DOCTYPE HTML><html><body>&nbsp; word1&nbsp; word2&nbsp;&nbsp; word3<br><br></body></html>',
wantText: ' word1 word2 word3\n\n',
wantHTML: '<!DOCTYPE HTML><html><body>&nbsp; word1&nbsp;&nbsp;word2&nbsp;&nbsp;&nbsp;word3<br><br></body></html>',
wantText: ' word1\u00a0\u00a0word2\u00a0\u00a0\u00a0word3\n\n',
},
'nonBreakingSpacePreceededBySpaceBetweenWords': {
description: 'A non-breaking space preceded by a normal space',

View File

@ -185,7 +185,7 @@ const testCases = [
description: 'non-breaking and normal space should be preserved',
html: '<html><body>Text&nbsp;with&nbsp; more&nbsp;&nbsp;&nbsp;than &nbsp;one space.<br></body></html>',
wantAlines: ['+10'],
wantText: ['Text with more than one space.'],
wantText: ['Text\u00a0with more\u00a0\u00a0\u00a0than one space.'],
},
{
description: 'Multiple nbsp should be preserved',
@ -197,7 +197,7 @@ const testCases = [
description: 'Multiple nbsp between words ',
html: '<html><body>&nbsp;&nbsp;word1&nbsp;&nbsp;word2&nbsp;&nbsp;&nbsp;word3<br></body></html>',
wantAlines: ['+m'],
wantText: [' word1 word2 word3'],
wantText: [' word1\u00a0\u00a0word2\u00a0\u00a0\u00a0word3'],
},
{
description: 'A non-breaking space preceded by a normal space',
@ -359,6 +359,16 @@ pre
],
wantText: ['before', '*bare item', 'after'],
},
{
// Regression for PR #7585 review feedback: a user-intended nbsp
// sitting at a DOM text-node boundary (split across spans) must
// still be preserved because canonicalization runs on the whole
// line, not per text node.
description: 'nbsp preserved across span boundary',
html: '<html><body><p><span>100</span><span>&nbsp;km</span></p></body></html>',
wantAlines: ['+6'],
wantText: ['100\u00a0km'],
},
];
describe(__filename, function () {