#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2023 AI, ZHIHU Inc. (zhihu.com)
#
# @author: ouzebin
# @date: 2023/08/07
import argparse
import os


def build_index(path):
    """Write a byte-offset index for the JSON-lines file ``<path>/data.jsonl``.

    The index is written to ``<path>/index`` as one decimal integer per line.
    The first entry is ``0``; each following entry is the cumulative byte
    length of the data file after one more line, so a file with N lines
    produces N + 1 entries and the last entry equals the total file size.

    Args:
        path: Directory that contains ``data.jsonl``; the ``index`` file is
            created (or overwritten) in the same directory.

    Raises:
        AssertionError: If ``<path>/data.jsonl`` does not exist.
    """
    data_path = os.path.join(path, "data.jsonl")
    # Raised explicitly instead of via an ``assert`` statement so the check is
    # not stripped under ``python -O``. The exception type and message are
    # kept as they were so existing callers see no difference.
    if not os.path.exists(data_path):
        raise AssertionError(f"Jsonline dataset '{data_path}' not found.")

    offset = 0
    starts = [offset]
    # Binary mode: len(line) must count bytes (including the line terminator),
    # not decoded characters, for the offsets to be usable with seek().
    with open(data_path, "rb") as fin:
        for line in fin:
            offset += len(line)
            starts.append(offset)

    # The index is only opened after the data file has been read completely,
    # so a read failure never leaves a truncated index behind.
    with open(os.path.join(path, "index"), "w") as fout:
        fout.writelines(f"{s}\n" for s in starts)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", "-p", required=True, help="Data path.")
    args = parser.parse_args()
    build_index(args.path)