# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Download the dataset from:
https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json

Convert to the OpenAI API format:
export INPUT_FILE=sharegpt_20230401_clean_lang_split.json
python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128
"""

import argparse
import json
import random
from statistics import mean
from typing import Any, Optional

import pandas as pd  # type: ignore
import tqdm  # type: ignore
from transformers import AutoTokenizer  # type: ignore


def has_non_english_chars(text: str) -> bool:
    return not text.isascii()


def content_is_valid(
    content: str, min_content_len: Optional[int], max_content_len: Optional[int]
) -> bool:
    if min_content_len and len(content) < min_content_len:
        return False

    if max_content_len and len(content) > max_content_len:
        return False

    # Reject content that contains non-English (non-ASCII) characters
    return not has_non_english_chars(content)


def print_stats(
    conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
) -> None:
    # Collect statistics
    stats = []

    print("\nCollecting statistics...")
    for item in tqdm.tqdm(conversations):
        # item has "id" and "messages"
        messages = item["messages"]

        user_turns = 0
        assistant_turns = 0
        user_words = 0
        assistant_words = 0
        conv_chars = 0
        user_tokens: list[int] = []
        assistant_tokens: list[int] = []

        for m in messages:
            content = m["content"]
            conv_chars += len(content)

            # Approximate word count by counting spaces
            content_num_words = content.count(" ") + 1

            num_tokens = 0
            if tokenizer:
                num_tokens = len(tokenizer(content).input_ids)

            if m["role"] == "user":
                user_turns += 1
                user_words += content_num_words
                if tokenizer:
                    user_tokens.append(num_tokens)
            elif m["role"] == "assistant":
                assistant_turns += 1
                assistant_words += content_num_words
                if tokenizer:
                    assistant_tokens.append(num_tokens)

        # assert user_turns == assistant_turns, \
        #     f"Invalid conversation ID {item['id']}"

        conv_words = user_words + assistant_words

        item_stats = {
            "user_turns": user_turns,
            "assistant_turns": assistant_turns,
            "user_words": user_words,
            "assistant_words": assistant_words,
            "conv_turns": len(messages),
            "conv_words": conv_words,
            "conv_characters": conv_chars,
        }

        if len(user_tokens) > 0:
            item_stats["user_tokens"] = int(mean(user_tokens))

        if len(assistant_tokens) > 0:
            item_stats["assistant_tokens"] = int(mean(assistant_tokens))

        stats.append(item_stats)

    print("\nStatistics:")
    percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
    df = pd.DataFrame(stats)
    print(df.describe(percentiles=percentiles).transpose())


def convert_sharegpt_to_openai(
    seed: int,
    input_file: str,
    output_file: str,
    max_items: Optional[int],
    min_content_len: Optional[int] = None,
    max_content_len: Optional[int] = None,
    min_turns: Optional[int] = None,
    max_turns: Optional[int] = None,
    model: Optional[str] = None,
) -> None:
    # Verify that min is not larger than max if both were given
    if min_turns and max_turns:
        assert min_turns <= max_turns

    if min_content_len and max_content_len:
        assert min_content_len <= max_content_len

    print(
        f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=},"
        f" {max_content_len=}, {min_turns=}, {max_turns=}\n"
    )

    random.seed(seed)

    tokenizer = None
    if model is not None:
        print(f"Loading tokenizer from: {model}")
        tokenizer = AutoTokenizer.from_pretrained(model)

    # Read the ShareGPT JSON file
    print(f"Reading file: {input_file}")
    with open(input_file, encoding="utf-8") as f:
        # Should be a list of dicts, where each dict has
        # "id" (string) and "conversations" (list of dicts)
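        # Illustrative input item (hypothetical values; the "_0"/"_1" suffix in
        # "id" is the part/session index of a conversation that was split):
        # {"id": "hiWPlMD_0",
        #  "conversations": [{"from": "human", "value": "Hello"},
        #                    {"from": "gpt", "value": "Hi there"}]}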
        sharegpt_data = json.load(f)

    assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts"
    print(f"Total items in input file: {len(sharegpt_data):,}")

    # Map conversation ID to all of its parts (each part is a list of turns).
    # Parts must be merged in file order, so the dataset is shuffled only
    # after merging (see below); shuffling first would scramble the parts.
    conversation_parts: dict[str, list[Any]] = {}

    for item in tqdm.tqdm(sharegpt_data):
        assert "id" in item, "Missing key 'id'"
        assert "conversations" in item, "Missing key 'conversations'"

        # Conversation ID (e.g: "hiWPlMD") and part/session (0, 1, 2, etc.)
        conv_id, _ = item["id"].split("_")

        new_turns = item["conversations"]

        if conv_id not in conversation_parts:
            # Start new conversation
            conversation_parts[conv_id] = []
        elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0:
            # Drop a duplicated turn at a part boundary (the new part
            # starts with the same speaker that ended the previous part)
            prev_turns = conversation_parts[conv_id][-1]
            if prev_turns[-1]["from"] == new_turns[0]["from"]:
                new_turns = new_turns[1:]

        if len(new_turns) > 0:
            # We assume that parts are in order in the ShareGPT dataset
            conversation_parts[conv_id].append(new_turns)

    dataset: list[dict[str, Any]] = []
    for conv_id, conv_parts in conversation_parts.items():
        new_item: dict[str, Any] = {"id": conv_id}

        conversations: list[dict[str, str]] = []

        # Merge all parts
        for conv_part in conv_parts:
            conversations.extend(conv_part)

        if len(conversations) > 0:
            new_item["conversations"] = conversations
            dataset.append(new_item)

    print(f"Total unique conversations (IDs) in input file: {len(dataset):,}")

    print(f"Shuffling dataset with seed {seed}")
    random.shuffle(dataset)

    # Final output data
    final_openai_dataset: list[dict] = []

    # Filter conversations from the ShareGPT dataset and convert to OpenAI format
    for item in tqdm.tqdm(dataset):
        messages: list[dict] = []

        assert "id" in item, "Missing key 'id'"
        assert "conversations" in item, "Missing key 'conversations'"

        conv_id = item["id"]
        conversations = item["conversations"]

        if min_turns is not None and len(conversations) < min_turns:
            # Skip short conversations
            continue

        # Convert each message in the conversation, up to max_turns if specified
        for i, turn in enumerate(conversations):
            assert "from" in turn and "value" in turn, (
                f"Invalid conversation ID {conv_id} - missing 'from' or 'value'"
            )

            role = None
            turn_from = turn["from"]
            if turn_from in {"human", "user"}:
                role = "user"
            elif turn_from in {"gpt", "bing", "chatgpt", "bard"}:
                role = "assistant"
            elif turn_from == "system":
                role = "system"

            assert role is not None, (
                f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid"
            )

            if i == 0 and role != "user":
                # If the first message is not from the user, skip it.
                # This happens when the conversation is a follow-up
                # to a previous conversation (from the same user).
                continue

            if max_turns is not None and i >= max_turns:
                break

            # Convert message to OpenAI format (with "role" and "content")
            content = turn["value"]
            messages.append({"role": role, "content": content})

        # Add the converted conversation to the OpenAI format
        if len(messages) > 0:
            valid_messages = True

            # First turn should always be from the user
            user_turn = True

            for m in messages:
                # Make sure that turns alternate between user and assistant
                if (user_turn and m["role"] != "user") or (
                    not user_turn and m["role"] != "assistant"
                ):
                    valid_messages = False
                    break

                user_turn = not user_turn

                content = m["content"]
                valid_messages = content_is_valid(
                    content, min_content_len, max_content_len
                )
                if not valid_messages:
                    break

            if valid_messages:
                final_openai_dataset.append({"id": conv_id, "messages": messages})

    assert len(final_openai_dataset) > 0, "Final number of conversations is zero"

    # Print stats without token counts (tokenizing the full dataset is slow)
    print_stats(final_openai_dataset)

    print_stats_again = False

    if max_items is not None and len(final_openai_dataset) > max_items:
        print(f"\n\nSampling {max_items} items from the dataset...")
        print_stats_again = True
        final_openai_dataset = random.sample(final_openai_dataset, max_items)

    if print_stats_again:
        # Print stats after the dataset changed
        print_stats(final_openai_dataset, tokenizer)

    # Write the converted data to a new JSON file
    final_size = len(final_openai_dataset)
    print(f"\nTotal conversations converted (after filtering): {final_size:,}")
    print(f"\nWriting file: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2)


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Convert ShareGPT dataset to OpenAI API format"
    )
    parser.add_argument("input_file", help="Path to the input ShareGPT JSON file")
    parser.add_argument(
        "output_file", help="Path to the output OpenAI format JSON file"
    )
    parser.add_argument(
        "--seed", type=int, default=0, help="Seed for random number generators"
    )
    parser.add_argument(
        "--max-items",
        type=int,
        default=None,
        help="Maximum number of items in the output file",
    )
    parser.add_argument(
        "--min-turns",
        type=int,
        default=None,
        help="Minimum number of turns per conversation",
    )
    parser.add_argument(
        "--max-turns",
        type=int,
        default=None,
        help="Maximum number of turns per conversation",
    )
    parser.add_argument(
        "--min-content-len",
        type=int,
        default=None,
        help="Min number of characters in the messages' content",
    )
    parser.add_argument(
        "--max-content-len",
        type=int,
        default=None,
        help="Max number of characters in the messages' content",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="LLM model, only the tokenizer will be used",
    )

    args = parser.parse_args()

    convert_sharegpt_to_openai(
        args.seed,
        args.input_file,
        args.output_file,
        args.max_items,
        args.min_content_len,
        args.max_content_len,
        args.min_turns,
        args.max_turns,
        args.model,
    )


if __name__ == "__main__":
    main()
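
# Illustrative output item (hypothetical content) as written to the output file:
# {"id": "hiWPlMD",
#  "messages": [{"role": "user", "content": "Hello"},
#               {"role": "assistant", "content": "Hi there"}]}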