pipr.tools

strip-invisible

Strip hidden Unicode used for AI watermarking / fingerprinting

Aa Text

Try it

stdin: 0 chars
stdout: 0 chars

Flags

--mode (select): clean | audit (default: clean)

Example

Clean hidden characters from AI-generated text

Usage
$ echo "Hello​, ‌world‍! This text has hidden⁠ characters." | strip-invisible
View source
(input, opts = {}) => {
      /*
       * Remove or report invisible Unicode characters in `input`.
       *
       * input: the text to process. null/undefined is treated as the empty
       *        string; any other non-string value is converted with String().
       * opts.mode:
       *   "audit"       - return a plain-text report: one line per distinct
       *                   hidden code point (most frequent first) followed by
       *                   a summary line, or "No hidden characters found.".
       *   anything else - return the cleaned text: space-like characters are
       *                   replaced by an ASCII space, then invisible
       *                   characters are deleted.
       */
      const text = input == null ? "" : String(input);
      const mode = opts?.mode;

      // Invisible chars: zero-width, bidi controls, variation selectors, tag chars, etc.
      const STRIP =
        /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u2069\u206A-\u206F\u2800\u3164\uFE00-\uFE0F\uFEFF\uFFF9-\uFFFB\uFFA0\u{E0000}-\u{E007F}\u{E0100}-\u{E01EF}\u{1D173}-\u{1D17A}\u{1BCA0}-\u{1BCA3}]/gu;
      // Space-like chars to normalize to ASCII space
      const NORM = /[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]/g;

      if (mode === "audit") {
        // Display names for individually listed code points; ranges are
        // handled in name() below.
        const names = {
          0x00ad: "SOFT HYPHEN",
          0x034f: "COMBINING GRAPHEME JOINER",
          0x061c: "ARABIC LETTER MARK",
          0x180e: "MONGOLIAN VOWEL SEPARATOR",
          0x200b: "ZERO WIDTH SPACE",
          0x200c: "ZERO WIDTH NON-JOINER",
          0x200d: "ZERO WIDTH JOINER",
          0x200e: "LEFT-TO-RIGHT MARK",
          0x200f: "RIGHT-TO-LEFT MARK",
          0x2060: "WORD JOINER",
          0x2061: "FUNCTION APPLICATION",
          0x2062: "INVISIBLE TIMES",
          0x2063: "INVISIBLE SEPARATOR",
          0x2064: "INVISIBLE PLUS",
          0xfeff: "ZERO WIDTH NO-BREAK SPACE",
          0x2800: "BRAILLE PATTERN BLANK",
          0x3164: "HANGUL FILLER",
          0x00a0: "NO-BREAK SPACE",
          0x1680: "OGHAM SPACE MARK",
          0x2000: "EN QUAD",
          0x2001: "EM QUAD",
          0x2002: "EN SPACE",
          0x2003: "EM SPACE",
          0x2004: "THREE-PER-EM SPACE",
          0x2005: "FOUR-PER-EM SPACE",
          0x2006: "SIX-PER-EM SPACE",
          0x2007: "FIGURE SPACE",
          0x2008: "PUNCTUATION SPACE",
          0x2009: "THIN SPACE",
          0x200a: "HAIR SPACE",
          0x202f: "NARROW NO-BREAK SPACE",
          0x205f: "MEDIUM MATHEMATICAL SPACE",
          0x3000: "IDEOGRAPHIC SPACE",
        };

        // Map a code point to a label for the report. Falls back to
        // "INVISIBLE" for matched code points without a specific label.
        const name = (cp) => {
          if (names[cp]) return names[cp];
          // U+FE00..U+FE0F are VARIATION SELECTOR-1..16.
          if (cp >= 0xfe00 && cp <= 0xfe0f)
            return `VARIATION SELECTOR-${cp - 0xfdff}`;
          // U+E0100..U+E01EF are VARIATION SELECTOR-17..256, so the offset is
          // 0xE0100 - 17 = 0xE00EF (the previous 0xE00E9 labelled them 23..262).
          if (cp >= 0xe0100 && cp <= 0xe01ef)
            return `VARIATION SELECTOR-${cp - 0xe00ef}`;
          // Tag characters U+E0020..U+E007E mirror ASCII 0x20..0x7E; show the
          // mirrored character so hidden tag payloads are readable.
          if (cp >= 0xe0020 && cp <= 0xe007e) {
            const ch = String.fromCodePoint(cp - 0xe0000);
            return ch.trim() ? `TAG '${ch}'` : "TAG SPACE";
          }
          if (cp >= 0xe0000 && cp <= 0xe007f) return "TAG CHARACTER";
          if (cp >= 0x202a && cp <= 0x202e) return "BIDI CONTROL";
          if (cp >= 0x2066 && cp <= 0x2069) return "BIDI ISOLATE";
          if (cp >= 0x206a && cp <= 0x206f) return "DEPRECATED FORMAT CHAR";
          return "INVISIBLE";
        };

        // Union of the STRIP and NORM character classes; keep in sync with both.
        const ALL =
          /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u2069\u206A-\u206F\u2800\u3164\uFE00-\uFE0F\uFEFF\uFFF9-\uFFFB\uFFA0\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000\u{E0000}-\u{E007F}\u{E0100}-\u{E01EF}\u{1D173}-\u{1D17A}\u{1BCA0}-\u{1BCA3}]/gu;

        // Code points that clean mode replaces with a space rather than
        // deleting (same members as NORM: U+2000..U+200A plus five singles).
        const spaceSet = new Set([
          0x00a0,
          0x1680,
          0x202f,
          0x205f,
          0x3000,
          ...Array.from({ length: 11 }, (_, i) => 0x2000 + i),
        ]);

        // Count occurrences per code point. The `u` flag makes each match a
        // whole code point, so codePointAt(0) is correct for astral characters.
        const found = new Map();
        for (const m of text.matchAll(ALL)) {
          const cp = m[0].codePointAt(0);
          found.set(cp, (found.get(cp) || 0) + 1);
        }
        if (found.size === 0) return "No hidden characters found.";

        // One report line per code point, most frequent first.
        const lines = [...found.entries()]
          .sort((a, b) => b[1] - a[1])
          .map(([cp, n]) => {
            const hex = `U+${cp.toString(16).toUpperCase().padStart(4, "0")}`;
            const act = spaceSet.has(cp) ? "\u2192 space" : "strip";
            return `${hex}  ${name(cp).padEnd(28)} \u00d7${String(n).padStart(3)}  (${act})`;
          });
        const total = [...found.values()].reduce((a, b) => a + b, 0);
        lines.push(
          "",
          `${total} hidden char${total !== 1 ? "s" : ""} in ${found.size} type${found.size !== 1 ? "s" : ""}`,
        );
        return lines.join("\n");
      }

      // Clean mode: normalize space-like characters first, then delete the rest.
      return text.replace(NORM, " ").replace(STRIP, "");
    }

Suggested Pipelines

Related Tools