Skip to content

Commit

Permalink
better detection of charset utf-8 in html: (#219)
Browse files Browse the repository at this point in the history
- detect if <meta charset='utf-8'> is set and switch parsing to utf-8,
if not already
- parse the first 512 bytes of the html buffer first, in case <meta charset>
is present near the beginning, so that subsequent parsing may use the correct
encoding
- fixes #218
  • Loading branch information
ikreymer authored Jan 1, 2025
1 parent b4fbd6b commit 1259d09
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 5 deletions.
8 changes: 7 additions & 1 deletion src/response.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
MAX_STREAM_CHUNK_SIZE,
tsToDate,
getStatusText,
INITIAL_STREAM_CHUNK_SIZE,
} from "./utils";
import { Buffer } from "buffer";

Expand Down Expand Up @@ -251,7 +252,12 @@ class ArchiveResponse {

async function* iter() {
if (buffer) {
for (let i = 0; i < buffer.length; i += MAX_STREAM_CHUNK_SIZE) {
let i = 0;

yield buffer.slice(0, i + INITIAL_STREAM_CHUNK_SIZE);
i += INITIAL_STREAM_CHUNK_SIZE;

for (i; i < buffer.length; i += MAX_STREAM_CHUNK_SIZE) {
yield buffer.slice(i, i + MAX_STREAM_CHUNK_SIZE);
}
} else if (reader) {
Expand Down
16 changes: 12 additions & 4 deletions src/rewrite/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,10 @@ class HTMLRewriter {
attr.name = "_" + attr.name;
} else if (tagName === "meta" && name === "content") {
attr.value = this.rewriteMetaContent(tag.attrs, attr, rewriter);
} else if (tagName === "meta" && name === "charset") {
if (value && ["utf8", "utf-8"].includes(value.toLowerCase())) {
this.isCharsetUTF8 = true;
}
} else if (tagName === "param" && isUrl(value)) {
attr.value = this.rewriteUrl(rewriter, attr.value);
} else if (name.startsWith("data-") && isUrl(value)) {
Expand Down Expand Up @@ -469,16 +473,20 @@ class HTMLRewriter {
const sourceGen = response.createIter();
let hasData = false;

const isCharsetUTF8 = this.isCharsetUTF8;
// eslint-disable-next-line @typescript-eslint/no-this-alias
const htmlrewriter = this;

response.setReader(
new ReadableStream({
async start(controller) {
rwStream.on("data", (text) => {
controller.enqueue(
// [TODO]
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
isCharsetUTF8 ? encoder.encode(text) : encodeLatin1(text),
htmlrewriter.isCharsetUTF8
? // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
encoder.encode(text)
: // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
encodeLatin1(text),
);
});

Expand All @@ -487,7 +495,7 @@ class HTMLRewriter {
});

for await (const chunk of sourceGen) {
if (isCharsetUTF8) {
if (htmlrewriter.isCharsetUTF8) {
rwStream.write(decoder.decode(chunk), "utf8");
} else {
rwStream.write(decodeLatin1(chunk), "latin1");
Expand Down
1 change: 1 addition & 0 deletions src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export const PAGE_STATE_NEED_REMOTE_SYNC = 0x10;
export const PAGE_STATE_NEED_LOCAL_SYNC = 0x01;
export const PAGE_STATE_SYNCED = 0x11;

// Length of the first slice taken from an in-memory buffer when it is streamed
// (presumably bytes -- confirm against the buffer type). Kept small so that an
// early <meta charset> can be seen before the remainder of the HTML is decoded.
export const INITIAL_STREAM_CHUNK_SIZE = 512;
// Step size for every subsequent slice of the buffer (65536 * 4 = 262144).
export const MAX_STREAM_CHUNK_SIZE = 65536 * 4;

export const REPLAY_TOP_FRAME_NAME = "___wb_replay_top_frame";
Expand Down

0 comments on commit 1259d09

Please sign in to comment.