Strip hidden Unicode used for AI watermarking / fingerprinting
Aa Text: Clean hidden characters from AI-generated text
/**
 * Remove hidden Unicode characters from text, or report which ones it contains.
 *
 * Usage line shown alongside this snippet:
 *   $ echo "Hello, world! This text has hidden characters." | strip-invisible
 *
 * Default mode: every space-like character matched by NORM becomes a plain
 * ASCII space, then every character matched by STRIP is deleted.
 *
 * Audit mode (`opts.mode === "audit"`): nothing is modified. The result is a
 * report with one line per distinct hidden code point (most frequent first),
 * showing its U+XXXX value, a descriptive name, the occurrence count and the
 * action default mode would take ("strip" or "→ space"), followed by a blank
 * line and a totals line. If nothing is found the report is the single line
 * "No hidden characters found.".
 *
 * @param {string} input - Text to clean or audit. `null`/`undefined` are
 *   treated as an empty string; other non-strings are converted with String().
 * @param {{ mode?: string }} [opts] - Options; only `mode` is read.
 * @returns {string} The cleaned text, or the audit report in audit mode.
 */
const stripInvisible = (input, opts = {}) => {
  const text = input == null ? "" : String(input);

  // Invisible chars: zero-width, bidi controls, variation selectors, tag chars, etc.
  const STRIP =
    /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u2069\u206A-\u206F\u2800\u3164\uFE00-\uFE0F\uFEFF\uFFF9-\uFFFB\uFFA0\u{E0000}-\u{E007F}\u{E0100}-\u{E01EF}\u{1D173}-\u{1D17A}\u{1BCA0}-\u{1BCA3}]/gu;
  // Space-like chars to normalize to ASCII space
  const NORM = /[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]/g;

  if (opts?.mode !== "audit") {
    return text.replace(NORM, " ").replace(STRIP, "");
  }

  // Names for individually listed code points; ranges are handled in describe().
  const names = {
    0x00ad: "SOFT HYPHEN",
    0x034f: "COMBINING GRAPHEME JOINER",
    0x061c: "ARABIC LETTER MARK",
    0x180e: "MONGOLIAN VOWEL SEPARATOR",
    0x200b: "ZERO WIDTH SPACE",
    0x200c: "ZERO WIDTH NON-JOINER",
    0x200d: "ZERO WIDTH JOINER",
    0x200e: "LEFT-TO-RIGHT MARK",
    0x200f: "RIGHT-TO-LEFT MARK",
    0x2060: "WORD JOINER",
    0x2061: "FUNCTION APPLICATION",
    0x2062: "INVISIBLE TIMES",
    0x2063: "INVISIBLE SEPARATOR",
    0x2064: "INVISIBLE PLUS",
    0xfeff: "ZERO WIDTH NO-BREAK SPACE",
    0x2800: "BRAILLE PATTERN BLANK",
    0x3164: "HANGUL FILLER",
    0x00a0: "NO-BREAK SPACE",
    0x1680: "OGHAM SPACE MARK",
    0x2000: "EN QUAD",
    0x2001: "EM QUAD",
    0x2002: "EN SPACE",
    0x2003: "EM SPACE",
    0x2004: "THREE-PER-EM SPACE",
    0x2005: "FOUR-PER-EM SPACE",
    0x2006: "SIX-PER-EM SPACE",
    0x2007: "FIGURE SPACE",
    0x2008: "PUNCTUATION SPACE",
    0x2009: "THIN SPACE",
    0x200a: "HAIR SPACE",
    0x202f: "NARROW NO-BREAK SPACE",
    0x205f: "MEDIUM MATHEMATICAL SPACE",
    0x3000: "IDEOGRAPHIC SPACE",
  };

  // Descriptive label for a matched code point; anything not covered by the
  // table or the ranges below falls back to the generic "INVISIBLE".
  const describe = (cp) => {
    if (names[cp]) return names[cp];
    // U+FE00..U+FE0F are VARIATION SELECTOR-1..16.
    if (cp >= 0xfe00 && cp <= 0xfe0f) {
      return `VARIATION SELECTOR-${cp - 0xfdff}`;
    }
    // U+E0100..U+E01EF are VARIATION SELECTOR-17..256, so the offset is
    // 0xE0100 - 17 = 0xE00EF.
    if (cp >= 0xe0100 && cp <= 0xe01ef) {
      return `VARIATION SELECTOR-${cp - 0xe00ef}`;
    }
    // Tag characters mirror ASCII 0x20..0x7E at an offset of 0xE0000.
    if (cp >= 0xe0020 && cp <= 0xe007e) {
      const ch = String.fromCodePoint(cp - 0xe0000);
      return ch.trim() ? `TAG '${ch}'` : "TAG SPACE";
    }
    if (cp >= 0xe0000 && cp <= 0xe007f) return "TAG CHARACTER";
    if (cp >= 0x202a && cp <= 0x202e) return "BIDI CONTROL";
    if (cp >= 0x2066 && cp <= 0x2069) return "BIDI ISOLATE";
    if (cp >= 0x206a && cp <= 0x206f) return "DEPRECATED FORMAT CHAR";
    return "INVISIBLE";
  };

  // Derive the audit matchers from STRIP and NORM so the code point lists are
  // written once and the report cannot drift from what default mode does.
  const ALL = new RegExp(`${STRIP.source}|${NORM.source}`, "gu");
  // Anchored and non-global, so repeated .test() calls carry no lastIndex state.
  const SPACE_LIKE = new RegExp(`^${NORM.source}$`, "u");

  // code point -> number of occurrences, in order of first appearance.
  const found = new Map();
  for (const match of text.matchAll(ALL)) {
    const cp = match[0].codePointAt(0);
    found.set(cp, (found.get(cp) ?? 0) + 1);
  }
  if (found.size === 0) return "No hidden characters found.";

  const lines = [...found.entries()]
    // Most frequent first; the sort is stable, so ties keep first-seen order.
    .sort((a, b) => b[1] - a[1])
    .map(([cp, n]) => {
      const hex = `U+${cp.toString(16).toUpperCase().padStart(4, "0")}`;
      const act = SPACE_LIKE.test(String.fromCodePoint(cp))
        ? "\u2192 space"
        : "strip";
      return `${hex} ${describe(cp).padEnd(28)} \u00d7${String(n).padStart(3)} (${act})`;
    });

  const total = [...found.values()].reduce((a, b) => a + b, 0);
  lines.push(
    "",
    `${total} hidden char${total !== 1 ? "s" : ""} in ${found.size} type${found.size !== 1 ? "s" : ""}`,
  );
  return lines.join("\n");
};