diff --git a/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/README.md b/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/README.md index af7de6df..93e66efb 100644 --- a/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/README.md +++ b/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/README.md @@ -29,3 +29,4 @@ | [text-to-python.ipynb](./text-to-code/text-to-python.ipynb) | Text-to-Code | Generate Python code from natural language instructions with validation and evaluation | | [text-to-python-evol.ipynb](./text-to-code/text-to-python-evol.ipynb) | Text-to-Code | Build advanced Python code generation with evolutionary improvements and iterative refinement | | [text-to-sql.ipynb](./text-to-code/text-to-sql.ipynb) | Text-to-Code | Create SQL queries from natural language descriptions with validation and testing | +| [japanese_commonsense_qa_data_generator_nemotron_persona_jp_seed.ipynb](./nemotron-persona-jp/japanese_commonsense_qa_data_generator_nemotron_persona_jp_seed.ipynb) | Nemotron-Persona-JP | Generate Japanese commonsense reasoning QA data from a Nemotron-Persona-JP seed dataset with LLM-judge quality scoring | diff --git a/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/nemotron-persona-jp/japanese_commonsense_qa_data_generator_nemotron_persona_jp_seed.ipynb b/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/nemotron-persona-jp/japanese_commonsense_qa_data_generator_nemotron_persona_jp_seed.ipynb index a9c6bc71..1a9b2758 100644 --- a/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/nemotron-persona-jp/japanese_commonsense_qa_data_generator_nemotron_persona_jp_seed.ipynb +++ b/nemo/NeMo-Data-Designer/self-hosted-tutorials/community-contributions/nemotron-persona-jp/japanese_commonsense_qa_data_generator_nemotron_persona_jp_seed.ipynb @@ -5,7 +5,7 @@ "id": "header", "metadata": {}, "source": [ - "# ๐จ NeMo Data Designer: Japanese Commonsense Reasoning 
Dataset Generation (Improved Version)\n", + "# ๐จ NeMo Data Designer: Japanese Commonsense Reasoning Dataset Generation\n", "\n", "#### ๐ Overview\n", "\n", @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 3, "id": "model_config_code", "metadata": {}, "outputs": [], @@ -96,7 +96,7 @@ "MODEL_PROVIDER = \"nvidiabuild\"\n", "MODEL_ID = \"openai/gpt-oss-120b\"\n", "MODEL_ALIAS = \"gpt-oss-120b\"\n", - "SYSTEM_PROMPT = \"\" \n", + "SYSTEM_PROMPT = \"\"\n", "JUDGE_MODEL_ALIAS = \"quality-judge\"\n", "\n", "model_configs = [\n", @@ -1300,7 +1300,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 20, "id": "config_with_seed_code", "metadata": {}, "outputs": [], @@ -1314,7 +1314,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 21, "id": "be7698ab-4dac-4491-aad2-ba2ad67df724", "metadata": {}, "outputs": [ @@ -1322,13 +1322,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "[11:35:31] [INFO] ๐ Uploading seed dataset to datastore\n" + "[15:03:51] [INFO] ๐ Uploading seed dataset to datastore\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2f5dd6f00f0846a88b1597b4b81d14f7", + "model_id": "f151f7e25f84424eaa890f36d335fff5", "version_major": 2, "version_minor": 0 }, @@ -1350,7 +1350,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 22, "id": "f72b780b-1838-4288-b7a2-f96a8276d21c", "metadata": {}, "outputs": [ @@ -1574,7 +1574,7 @@ ")" ] }, - "execution_count": 73, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1598,7 +1598,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 23, "id": "jcqa_generation_code", "metadata": {}, "outputs": [ @@ -1669,7 +1669,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 24, "id": "quality_metrics_code", "metadata": {}, "outputs": [ @@ -1761,7 +1761,7 @@ }, { "cell_type": "code", - "execution_count": 77, 
+ "execution_count": 25, "id": "preview_with_seed", "metadata": { "scrolled": true @@ -1771,10 +1771,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "[11:35:59] [INFO] โ Validation passed\n", - "[11:35:59] [INFO] ๐ Starting preview generation\n", - "[11:35:59] [INFO] โ๏ธ Sorting column configs into a Directed Acyclic Graph\n", - "[11:35:59] [INFO] ๐ฉบ Running health checks for models...\n" + "[15:03:56] [INFO] โ Validation passed\n", + "[15:03:56] [INFO] ๐ Starting preview generation\n", + "[15:03:56] [INFO] โ๏ธ Sorting column configs into a Directed Acyclic Graph\n", + "[15:03:56] [INFO] ๐ฉบ Running health checks for models...\n" ] }, { @@ -1791,17 +1791,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "[11:36:00] [INFO] |-- ๐ Checking 'openai/gpt-oss-120b' in provider named 'nvidiabuild' for model alias 'quality-judge'...\n", - "[11:36:00] [INFO] |-- โ Passed!\n", - "[11:36:00] [INFO] |-- ๐ Checking 'openai/gpt-oss-120b' in provider named 'nvidiabuild' for model alias 'gpt-oss-120b'...\n", - "[11:36:00] [INFO] |-- โ Passed!\n", - "[11:36:01] [INFO] โณ Processing batch 1 of 1\n", - "[11:36:01] [INFO] ๐ฑ Sampling 1 records from seed dataset\n", - "[11:36:01] [INFO] |-- seed dataset size: 2000 records\n", - "[11:36:01] [INFO] |-- sampling strategy: ordered\n", - "[11:36:01] [INFO] ๐๏ธ Preparing llm-structured column generation\n", - "[11:36:01] [INFO] |-- column name: 'jcqa_data'\n", - "[11:36:01] [INFO] |-- model config:\n", + "[15:03:57] [INFO] |-- ๐ Checking 'openai/gpt-oss-120b' in provider named 'nvidiabuild' for model alias 'gpt-oss-120b'...\n", + "[15:03:57] [INFO] |-- โ Passed!\n", + "[15:03:58] [INFO] |-- ๐ Checking 'openai/gpt-oss-120b' in provider named 'nvidiabuild' for model alias 'quality-judge'...\n", + "[15:03:58] [INFO] |-- โ Passed!\n", + "[15:03:58] [INFO] โณ Processing batch 1 of 1\n", + "[15:03:58] [INFO] ๐ฑ Sampling 1 records from seed dataset\n", + "[15:03:58] [INFO] |-- seed dataset size: 2000 records\n", + 
"[15:03:58] [INFO] |-- sampling strategy: ordered\n", + "[15:03:58] [INFO] ๐๏ธ Preparing llm-structured column generation\n", + "[15:03:58] [INFO] |-- column name: 'jcqa_data'\n", + "[15:03:58] [INFO] |-- model config:\n", "{\n", " \"alias\": \"gpt-oss-120b\",\n", " \"model\": \"openai/gpt-oss-120b\",\n", @@ -1815,10 +1815,10 @@ " },\n", " \"provider\": \"nvidiabuild\"\n", "}\n", - "[11:36:06] [INFO] ๐ Processing llm-structured column 'jcqa_data' with 8 concurrent workers\n", - "[11:36:09] [INFO] โ๏ธ Preparing llm-judge column generation\n", - "[11:36:09] [INFO] |-- column name: 'quality_metrics'\n", - "[11:36:09] [INFO] |-- model config:\n", + "[15:04:03] [INFO] ๐ Processing llm-structured column 'jcqa_data' with 8 concurrent workers\n", + "[15:04:05] [INFO] โ๏ธ Preparing llm-judge column generation\n", + "[15:04:05] [INFO] |-- column name: 'quality_metrics'\n", + "[15:04:05] [INFO] |-- model config:\n", "{\n", " \"alias\": \"quality-judge\",\n", " \"model\": \"openai/gpt-oss-120b\",\n", @@ -1832,76 +1832,76 @@ " },\n", " \"provider\": \"nvidiabuild\"\n", "}\n", - "[11:36:13] [INFO] ๐ Processing llm-judge column 'quality_metrics' with 4 concurrent workers\n", - "[11:36:13] [INFO] ๐งฉ Generating column `clarity_score` from expression\n", - "[11:36:13] [INFO] ๐งฉ Generating column `difficulty` from expression\n", - "[11:36:13] [INFO] ๐ Model usage summary:\n", + "[15:04:10] [INFO] ๐ Processing llm-judge column 'quality_metrics' with 4 concurrent workers\n", + "[15:04:10] [INFO] ๐งฉ Generating column `clarity_score` from expression\n", + "[15:04:10] [INFO] ๐งฉ Generating column `difficulty` from expression\n", + "[15:04:10] [INFO] ๐ Model usage summary:\n", "{\n", " \"openai/gpt-oss-120b\": {\n", " \"token_usage\": {\n", - " \"prompt_tokens\": 1216,\n", - " \"completion_tokens\": 934,\n", - " \"total_tokens\": 2150\n", + " \"prompt_tokens\": 1653,\n", + " \"completion_tokens\": 563,\n", + " \"total_tokens\": 2216\n", " },\n", " \"request_usage\": {\n", " 
\"successful_requests\": 1,\n", " \"failed_requests\": 0,\n", " \"total_requests\": 1\n", " },\n", - " \"tokens_per_second\": 168,\n", - " \"requests_per_minute\": 4\n", + " \"tokens_per_second\": 187,\n", + " \"requests_per_minute\": 5\n", " }\n", "}\n", - "[11:36:13] [INFO] ๐ Measuring dataset column statistics:\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'uuid'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'professional_persona'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'sports_persona'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'arts_persona'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'travel_persona'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'culinary_persona'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'persona'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'cultural_background'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'skills_and_expertise'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'skills_and_expertise_list'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'hobbies_and_interests'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'hobbies_and_interests_list'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'career_goals_and_ambitions'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'sex'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'age'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'marital_status'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'education_level'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'occupation'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'region'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'area'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'prefecture'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'country'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'age_band'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: '_all_text'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: '_core_text'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: '_core_len'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: '_attr_key'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'score_finance'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'score_safety'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'score_vocab'\n", - 
"[11:36:13] [INFO] |-- ๐ฑ column: 'score_public'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'score_tools'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'score_life'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'score_geo'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'score_culture'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: '_geo_text'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: '_tools_text'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: '_kw_hits'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'jc_category'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'max_score_any'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: '_public_bonus'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: '_religion_pen'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'jc_theme'\n", - "[11:36:13] [INFO] |-- ๐ฑ column: 'topic_category'\n", - "[11:36:13] [INFO] |-- ๐๏ธ column: 'jcqa_data'\n", - "[11:36:13] [INFO] |-- โ๏ธ column: 'quality_metrics'\n", - "[11:36:13] [INFO] |-- ๐งฉ column: 'clarity_score'\n", - "[11:36:13] [INFO] |-- ๐งฉ column: 'difficulty'\n", - "[11:36:13] [INFO] ๐ Preview complete!\n" + "[15:04:10] [INFO] ๐ Measuring dataset column statistics:\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'uuid'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'professional_persona'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'sports_persona'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'arts_persona'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'travel_persona'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'culinary_persona'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'persona'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'cultural_background'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'skills_and_expertise'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'skills_and_expertise_list'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'hobbies_and_interests'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'hobbies_and_interests_list'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'career_goals_and_ambitions'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'sex'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'age'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 
'marital_status'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'education_level'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'occupation'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'region'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'area'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'prefecture'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'country'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'age_band'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: '_all_text'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: '_core_text'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: '_core_len'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: '_attr_key'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'score_finance'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'score_safety'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'score_vocab'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'score_public'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'score_tools'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'score_life'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'score_geo'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'score_culture'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: '_geo_text'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: '_tools_text'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: '_kw_hits'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'jc_category'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'max_score_any'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: '_public_bonus'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: '_religion_pen'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'jc_theme'\n", + "[15:04:10] [INFO] |-- ๐ฑ column: 'topic_category'\n", + "[15:04:10] [INFO] |-- ๐๏ธ column: 'jcqa_data'\n", + "[15:04:10] [INFO] |-- โ๏ธ column: 'quality_metrics'\n", + "[15:04:10] [INFO] |-- ๐งฉ column: 'clarity_score'\n", + "[15:04:10] [INFO] |-- ๐งฉ column: 'difficulty'\n", + "[15:04:10] [INFO] โ Preview complete!\n" ] }, { @@ -2043,19 +2043,19 @@ "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", "โ clarity_score โ ๆ็ขบ โ\n", 
"โโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ difficulty โ ๆฎ้ โ\n", + "โ difficulty โ ๆใใ โ\n", "โโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", "โ jcqa_data โ { โ\n", "โ โ 'answer_index': 0, โ\n", - "โ โ 'choice0': '็พ้๏ผ็ดๅนฃใป็กฌ่ฒจ๏ผ', โ\n", - "โ โ 'choice1': 'ใฏใฌใธใใใซใผใ', โ\n", - "โ โ 'choice2': '้ปๅญใใใผ๏ผSuicaใปICOCA ใชใฉ๏ผ', โ\n", - "โ โ 'choice3': 'ๅฐๅๆ', โ\n", - "โ โ 'choice4': 'ในใใผใใใฉใณๆฑบๆธ๏ผPayPay ใชใฉ๏ผ', โ\n", + "โ โ 'choice0': '็พ้ใงๆฏๆใ', โ\n", + "โ โ 'choice1': 'ใฏใฌใธใใใซใผใใงๆฏๆใ', โ\n", + "โ โ 'choice2': 'ใขใใคใซSuicaใงๆฏๆใ', โ\n", + "โ โ 'choice3': 'ไบค้็ณปICใซใผใใฎใใฃใผใธใ่กใ', โ\n", + "โ โ 'choice4': '้ปๅญใใใผ๏ผๆฅฝๅคฉใใค๏ผใงๆฏๆใ', โ\n", "โ โ 'question': โ\n", - "โ โ 'ไธ้็ใฎๅ ฌๅ ฑๅณๆธ้คจใงๆฌใฎ่ฟๅดๆ้ใ้ใใ้ใฎๅปถๆป้ใๆฏๆใใจใใๅไปใงไธ่ฌ็ใซๅใไปใใฆใใ โฆ โ\n", + "โ โ 'ไธ้็ไผๅข้ง ใฎ็ชๅฃใงใICใซใผใใฎๆฎ้ซใไธ่ถณใใฆใใใใไน่ปๅธใ่ณผๅ ฅใใใใจใใฆใใใ็ชๅฃใงใฎ โฆ โ\n", "โ โ 'reasoning': โ\n", - "โ โ 'ๆฅๆฌใฎๅคใใฎๅ ฌๅ ฑๅณๆธ้คจใงใฏใๅปถๆป้ใฎๆฏๆใใฏ็ชๅฃใงใฎ็พ้ใฎใฟใๆจๆบ็ใซๅใไปใใใใฆใใพใใ โฆ โ\n", + "โ โ '้ง ใฎ็ชๅฃใงไน่ปๅธใ่ณผๅ ฅใใ้ใฏใๆใไธ่ฌ็ใใค็ขบๅฎใซๅใไปใใใใๆฏๆใๆนๆณใฏ็พ้ใงใใใใฏ โฆ โ\n", "โ โ } โ\n", "โโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", " \n", @@ -2064,9 +2064,9 @@ "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", "โ difficulty โ question_clarity โ\n", "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ score: ๆฎ้ โ score: ๆ็ขบ โ\n", + "โ score: ๆใใ โ score: ๆ็ขบ โ\n", "โ reasoning: โ reasoning: โ\n", - "โ ใใฎๅ้กใฏๆฅๆฌใฎๅ ฌๅ ฑๅณๆธ้คจใฎไธ่ฌ็ใช้ๅถๅฎๅใซ้ขใใ โฆ โ ่ณชๅใฏใไธ้็ใฎๅ ฌๅ ฑๅณๆธ้คจใงๆฌใฎ่ฟๅดๆ้ใ้ใใ้ใฎ โฆ โ\n", + "โ ๆฅๆฌใฎ้ง ็ชๅฃใงใฎๆฏๆใๆนๆณใฏๅบใ็ฅใใใๅธธ่ญใงใใใ โฆ โ ่ณชๅใฏๅ 
ทไฝ็ใชใทใใฅใจใผใทใงใณ๏ผไผๅข้ง ใฎ็ชๅฃใงICใซใผ โฆ โ\n", "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", " \n", " [index: 0] \n", @@ -2201,19 +2201,19 @@ "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", "โ clarity_score โ ๆ็ขบ โ\n", "โโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ difficulty โ ๆฎ้ โ\n", + "โ difficulty โ ๆใใ โ\n", "โโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", "โ jcqa_data โ \u001b[1m{\u001b[0m โ\n", "โ โ \u001b[32m'answer_index'\u001b[0m: \u001b[1;36m0\u001b[0m, โ\n", - "โ โ \u001b[32m'choice0'\u001b[0m: \u001b[32m'็พ้๏ผ็ดๅนฃใป็กฌ่ฒจ๏ผ'\u001b[0m, โ\n", - "โ โ \u001b[32m'choice1'\u001b[0m: \u001b[32m'ใฏใฌใธใใใซใผใ'\u001b[0m, โ\n", - "โ โ \u001b[32m'choice2'\u001b[0m: \u001b[32m'้ปๅญใใใผ๏ผSuicaใปICOCA ใชใฉ๏ผ'\u001b[0m, โ\n", - "โ โ \u001b[32m'choice3'\u001b[0m: \u001b[32m'ๅฐๅๆ'\u001b[0m, โ\n", - "โ โ \u001b[32m'choice4'\u001b[0m: \u001b[32m'ในใใผใใใฉใณๆฑบๆธ๏ผPayPay ใชใฉ๏ผ'\u001b[0m, โ\n", + "โ โ \u001b[32m'choice0'\u001b[0m: \u001b[32m'็พ้ใงๆฏๆใ'\u001b[0m, โ\n", + "โ โ \u001b[32m'choice1'\u001b[0m: \u001b[32m'ใฏใฌใธใใใซใผใใงๆฏๆใ'\u001b[0m, โ\n", + "โ โ \u001b[32m'choice2'\u001b[0m: \u001b[32m'ใขใใคใซSuicaใงๆฏๆใ'\u001b[0m, โ\n", + "โ โ \u001b[32m'choice3'\u001b[0m: \u001b[32m'ไบค้็ณปICใซใผใใฎใใฃใผใธใ่กใ'\u001b[0m, โ\n", + "โ โ \u001b[32m'choice4'\u001b[0m: \u001b[32m'้ปๅญใใใผ๏ผๆฅฝๅคฉใใค๏ผใงๆฏๆใ'\u001b[0m, โ\n", "โ โ \u001b[32m'question'\u001b[0m: โ\n", - "โ โ \u001b[32m'ไธ้็ใฎๅ ฌๅ ฑๅณๆธ้คจใงๆฌใฎ่ฟๅดๆ้ใ้ใใ้ใฎๅปถๆป้ใๆฏๆใใจใใๅไปใงไธ่ฌ็ใซๅใไปใใฆใใ โฆ\u001b[0m โ\n", + "โ โ \u001b[32m'ไธ้็ไผๅข้ง ใฎ็ชๅฃใงใICใซใผใใฎๆฎ้ซใไธ่ถณใใฆใใใใไน่ปๅธใ่ณผๅ ฅใใใใจใใฆใใใ็ชๅฃใงใฎ โฆ\u001b[0m โ\n", "โ โ \u001b[32m'reasoning'\u001b[0m: โ\n", - "โ โ \u001b[32m'ๆฅๆฌใฎๅคใใฎๅ ฌๅ 
ฑๅณๆธ้คจใงใฏใๅปถๆป้ใฎๆฏๆใใฏ็ชๅฃใงใฎ็พ้ใฎใฟใๆจๆบ็ใซๅใไปใใใใฆใใพใใ โฆ\u001b[0m โ\n", + "โ โ \u001b[32m'้ง ใฎ็ชๅฃใงไน่ปๅธใ่ณผๅ ฅใใ้ใฏใๆใไธ่ฌ็ใใค็ขบๅฎใซๅใไปใใใใๆฏๆใๆนๆณใฏ็พ้ใงใใใใฏ โฆ\u001b[0m โ\n", "โ โ \u001b[1m}\u001b[0m โ\n", "โโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", " \n", @@ -2222,9 +2222,9 @@ "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", "โ\u001b[1m \u001b[0m\u001b[1mdifficulty \u001b[0m\u001b[1m \u001b[0mโ\u001b[1m \u001b[0m\u001b[1mquestion_clarity \u001b[0m\u001b[1m \u001b[0mโ\n", "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ score: ๆฎ้ โ score: ๆ็ขบ โ\n", + "โ score: ๆใใ โ score: ๆ็ขบ โ\n", "โ reasoning: โ reasoning: โ\n", - "โ ใใฎๅ้กใฏๆฅๆฌใฎๅ ฌๅ ฑๅณๆธ้คจใฎไธ่ฌ็ใช้ๅถๅฎๅใซ้ขใใ โฆ โ ่ณชๅใฏใไธ้็ใฎๅ ฌๅ ฑๅณๆธ้คจใงๆฌใฎ่ฟๅดๆ้ใ้ใใ้ใฎ โฆ โ\n", + "โ ๆฅๆฌใฎ้ง ็ชๅฃใงใฎๆฏๆใๆนๆณใฏๅบใ็ฅใใใๅธธ่ญใงใใใ โฆ โ ่ณชๅใฏๅ ทไฝ็ใชใทใใฅใจใผใทใงใณ๏ผไผๅข้ง ใฎ็ชๅฃใงICใซใผ โฆ โ\n", "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", " \n", " [index: 0] \n" @@ -2251,7 +2251,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 26, "id": "preview_analysis", "metadata": { "scrolled": true @@ -2378,7 +2378,7 @@ "โ โ โ โ prompt tokens โ completion tokens โ\n", "โ column name โ data type โ number unique values โ per record โ per record โ\n", "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ jcqa_data โ dict โ 1 (100.0%) โ 1319.0 +/- 0.0 โ 396.0 +/- nan โ\n", + "โ jcqa_data โ dict โ 1 (100.0%) โ 1319.0 +/- 0.0 โ 361.0 +/- nan โ\n", "โโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโ\n", " \n", " \n", @@ -2387,7 +2387,7 @@ "โ โ 
โ โ prompt tokens โ completion tokens โ\n", "โ column name โ data type โ number unique values โ per record โ per record โ\n", "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ quality_metrics โ dict โ 1 (100.0%) โ 1746.0 +/- 0.0 โ 272.0 +/- nan โ\n", + "โ quality_metrics โ dict โ 1 (100.0%) โ 1714.0 +/- 0.0 โ 282.0 +/- nan โ\n", "โโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโ\n", " \n", " \n", @@ -2522,7 +2522,7 @@ "โ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m prompt tokens\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m completion tokens\u001b[0m\u001b[1;2m \u001b[0mโ\n", "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\n", "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ jcqa_data โ dict โ 1 (100.0%) โ 1319.0 +/- 0.0 โ 396.0 +/- nan โ\n", + "โ jcqa_data โ dict โ 1 (100.0%) โ 1319.0 +/- 0.0 โ 361.0 +/- nan โ\n", "โโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโ\n", " \n", " \n", @@ -2531,7 +2531,7 @@ "โ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m prompt tokens\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m completion tokens\u001b[0m\u001b[1;2m \u001b[0mโ\n", "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m 
\u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\n", "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ quality_metrics โ dict โ 1 (100.0%) โ 1746.0 +/- 0.0 โ 272.0 +/- nan โ\n", + "โ quality_metrics โ dict โ 1 (100.0%) โ 1714.0 +/- 0.0 โ 282.0 +/- nan โ\n", "โโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโ\n", " \n", " \n", @@ -2567,7 +2567,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 27, "id": "preview_dataset", "metadata": { "scrolled": true @@ -2613,16 +2613,16 @@ "0 None C_ๆฏๆใใปใ้ ๅ ฌๅ ฑใฎๅ ด \n", "\n", " jcqa_data \\\n", - "0 {'answer_index': 0, 'choice0': '็พ้๏ผ็ดๅนฃใป็กฌ่ฒจ๏ผ', 'c... \n", + "0 {'answer_index': 0, 'choice0': '็พ้ใงๆฏๆใ', 'choi... \n", "\n", " jcqa_data__reasoning_trace \\\n", "0 We need to output JSON with fields: question, ... \n", "\n", " quality_metrics \\\n", - "0 {'difficulty': {'reasoning': 'ใใฎๅ้กใฏๆฅๆฌใฎๅ ฌๅ ฑๅณๆธ้คจใฎไธ่ฌ... \n", + "0 {'difficulty': {'reasoning': 'ๆฅๆฌใฎ้ง ็ชๅฃใงใฎๆฏๆใๆนๆณใฏๅบใ... \n", "\n", " quality_metrics__reasoning_trace clarity_score difficulty \n", - "0 We need to evaluate the generated data's quali... ๆ็ขบ ๆฎ้ \n", + "0 We need to evaluate the generated data's quali... 
ๆ็ขบ ๆใใ \n", "\n", "[1 rows x 50 columns]\n" ] @@ -2681,314 +2681,10 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "analysis_with_seed", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "================================================================================\n", - "Seedใใผใฟใใ็ใฎๅๆ\n", - "================================================================================\n" - ] - }, - { - "data": { - "text/html": [ - "
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ ๐จ Data Designer Dataset Profile โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "\n", - " \n", - " Dataset Overview \n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "โ number of records โ number of columns โ percent complete records โ\n", - "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ 7,992 โ 48 โ 99.9% โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - " \n", - " \n", - " ๐ฑ Seed-Dataset Columns \n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "โ column name โ data type โ number unique values โ\n", - "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ uuid โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ professional_persona โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ sports_persona โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ arts_persona โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ travel_persona โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ culinary_persona โ string โ 2000 (25.0%) โ\n", - 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ persona โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ cultural_background โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ skills_and_expertise โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ skills_and_expertise_list โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ hobbies_and_interests โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ hobbies_and_interests_list โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ career_goals_and_ambitions โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ sex โ string โ 2 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ age โ int โ 82 (1.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ marital_status โ string โ 8 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ education_level โ string โ 10 (0.1%) โ\n", - 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ occupation โ string โ 615 (7.7%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ region โ string โ 8 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ area โ string โ 2 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ prefecture โ string โ 47 (0.6%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ country โ string โ 1 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ age_band โ None โ 0 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _all_text โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _core_text โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _core_len โ int โ 130 (1.6%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _attr_key โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_finance โ int โ 4 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_safety โ int โ 3 
(0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_vocab โ int โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_public โ int โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_tools โ int โ 2 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_life โ int โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_geo โ int โ 7 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_culture โ int โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _geo_text โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _tools_text โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _kw_hits โ int โ 7 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ jc_category โ string โ 8 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ max_score_any โ float โ 6 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _public_bonus 
โ None โ 0 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _religion_pen โ None โ 0 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ jc_theme โ string โ 6 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ topic_category โ string โ 7 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - " \n", - " \n", - " ๐๏ธ LLM-Structured Columns \n", - "โโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "โ โ โ โ prompt tokens โ completion tokens โ\n", - "โ column name โ data type โ number unique values โ per record โ per record โ\n", - "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ jcqa_data โ dict โ 7992 (100.0%) โ 1322.0 +/- 7.7 โ 318.0 +/- 64.9 โ\n", - "โโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - " \n", - " \n", - " โ๏ธ LLM-Judge Columns \n", - "โโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "โ โ โ โ prompt tokens โ completion tokens โ\n", - "โ column name โ data type โ number unique values โ per record โ per record โ\n", - "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ quality_metrics โ dict โ 7992 (100.0%) โ 1667.0 +/- 66.5 โ 281.0 +/- 42.2 โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโ\n", - " \n", - " \n", - " ๐งฉ Expression Columns \n", 
- "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "โ column name โ data type โ number unique values โ\n", - "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ clarity_score โ string โ 2 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ difficulty โ string โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - " \n", - " \n", - "โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ Table Notes โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ\n", - "โ โ\n", - "โ 1. All token statistics are based on a sample of max(1000, len(dataset)) records. โ\n", - "โ 2. Tokens are calculated using tiktoken's cl100k_base tokenizer. โ\n", - "โ โ\n", - "โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ\n", - " \n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "\n" - ], - "text/plain": [ - "\u001b[1;38;2;118;185;0mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ \u001b[0m๐จ Data Designer Dataset Profile\u001b[1;38;2;118;185;0m โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\n", - "\n", - " \n", - "\u001b[1;38;2;118;185;0m Dataset Overview \u001b[0m\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "โ\u001b[1;2m \u001b[0m\u001b[1;2mnumber of records \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2mnumber of columns \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2mpercent complete records \u001b[0m\u001b[1;2m \u001b[0mโ\n", - 
"โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ 7,992 โ 48 โ 99.9% โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - " \n", - " \n", - "\u001b[1;38;2;118;185;0m ๐ฑ Seed-Dataset Columns \u001b[0m\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\n", - "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ uuid โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ professional_persona โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ sports_persona โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ arts_persona โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ travel_persona โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ culinary_persona โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ persona โ string โ 2000 (25.0%) โ\n", - 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ cultural_background โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ skills_and_expertise โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ skills_and_expertise_list โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ hobbies_and_interests โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ hobbies_and_interests_list โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ career_goals_and_ambitions โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ sex โ string โ 2 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ age โ int โ 82 (1.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ marital_status โ string โ 8 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ education_level โ string โ 10 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ occupation โ string โ 615 (7.7%) โ\n", - 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ region โ string โ 8 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ area โ string โ 2 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ prefecture โ string โ 47 (0.6%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ country โ string โ 1 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ age_band โ \u001b[3;35mNone\u001b[0m โ 0 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _all_text โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _core_text โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _core_len โ int โ 130 (1.6%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _attr_key โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_finance โ int โ 4 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_safety โ int โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ 
score_vocab โ int โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_public โ int โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_tools โ int โ 2 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_life โ int โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_geo โ int โ 7 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ score_culture โ int โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _geo_text โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _tools_text โ string โ 2000 (25.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _kw_hits โ int โ 7 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ jc_category โ string โ 8 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ max_score_any โ float โ 6 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _public_bonus โ \u001b[3;35mNone\u001b[0m โ 0 (0.0%) โ\n", - 
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ _religion_pen โ \u001b[3;35mNone\u001b[0m โ 0 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ jc_theme โ string โ 6 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ topic_category โ string โ 7 (0.1%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - " \n", - " \n", - "\u001b[1;38;2;118;185;0m ๐๏ธ LLM-Structured Columns \u001b[0m\n", - "โโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "โ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m prompt tokens\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m completion tokens\u001b[0m\u001b[1;2m \u001b[0mโ\n", - "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\n", - "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ jcqa_data โ dict โ 7992 (100.0%) โ 1322.0 +/- 7.7 โ 318.0 +/- 64.9 โ\n", - "โโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - " \n", - " \n", - "\u001b[1;38;2;118;185;0m โ๏ธ LLM-Judge Columns \u001b[0m\n", - 
"โโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "โ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m prompt tokens\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m completion tokens\u001b[0m\u001b[1;2m \u001b[0mโ\n", - "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m per record\u001b[0m\u001b[1;2m \u001b[0mโ\n", - "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ quality_metrics โ dict โ 7992 (100.0%) โ 1667.0 +/- 66.5 โ 281.0 +/- 42.2 โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโ\n", - " \n", - " \n", - "\u001b[1;38;2;118;185;0m ๐งฉ Expression Columns \u001b[0m\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - "โ\u001b[1;2m \u001b[0m\u001b[1;2mcolumn name \u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m data type\u001b[0m\u001b[1;2m \u001b[0mโ\u001b[1;2m \u001b[0m\u001b[1;2m number unique values\u001b[0m\u001b[1;2m \u001b[0mโ\n", - "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n", - "โ clarity_score โ string โ 2 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค\n", - "โ difficulty โ string โ 3 (0.0%) โ\n", - "โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n", - " 
\n", - " \n", - "\u001b[2mโญโ\u001b[0m\u001b[2mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\u001b[2m Table Notes \u001b[0m\u001b[2mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\u001b[2mโโฎ\u001b[0m\n", - "\u001b[2mโ\u001b[0m \u001b[2mโ\u001b[0m\n", - "\u001b[2mโ\u001b[0m 1. All token statistics are based on a sample of max(1000, len(dataset)) records. \u001b[2mโ\u001b[0m\n", - "\u001b[2mโ\u001b[0m 2. Tokens are calculated using tiktoken's cl100k_base tokenizer. \u001b[2mโ\u001b[0m\n", - "\u001b[2mโ\u001b[0m \u001b[2mโ\u001b[0m\n", - "\u001b[2mโฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ\u001b[0m\n", - " \n", - "\u001b[1;38;2;118;185;0mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Seedใใ็ใฎๅๆ\n", "print(\"\\n\" + \"=\"*80)\n", @@ -3014,30 +2710,7 @@ "execution_count": null, "id": "269b2e97-4244-462b-824a-21a231eaac96", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Seedใใ็ใฎใใผใฟๆฐ: 7992\n", - "\n", - "Seedใใ็ใฎๅ่ณชในใณใขๅๅธ:\n", - "\n", - "clarity_score:\n", - "clarity_score\n", - "ๆ็ขบ 7974\n", - "ใใไธๆ็ขบ 18\n", - "Name: count, dtype: int64[pyarrow]\n", - "\n", - "difficulty:\n", - "difficulty\n", - "ๆใใ 6984\n", - "ๆฎ้ 1007\n", - "้ฃใใ 1\n", - "Name: count, dtype: int64[pyarrow]\n" - ] - } - ], + "outputs": [], "source": [ "# ใใผใฟใฎ่ชญใฟ่พผใฟ\n", "df_with_seed = job_with_seed.load_dataset()\n", @@ -3073,30 +2746,10 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "4d1023ea-dd88-4301-85ed-40b60e481024", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[17:52:08] [INFO] ๐บ Downloading artifacts from Job with ID 'job-2qxnztmlrfrkdkjzuxv2kd'\n", - 
"[17:52:10] [INFO] โ Artifacts downloaded to jcommonsenseqa_data_output_filter_jcommonsenseqa_seed_adjust_2000_temperature_0_9_remake_metric_8000_blog_check_en/with_seed_data\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "ใใผใฟใ 'jcommonsenseqa_data_output_filter_jcommonsenseqa_seed_adjust_2000_temperature_0_9_remake_metric_8000_blog_check_en' ใใฃใฌใฏใใชใซไฟๅญใใพใใใ\n", - "\n", - "ไฟๅญใใใใใกใคใซ:\n", - " - with_seed_data.jsonl (103554.34 KB)\n" - ] - } - ], + "outputs": [], "source": [ "OUTPUT_DIR = \"jcommonsenseqa_8000_filter_jcommonsenseqa_seed_2000_temperature_0_9\"\n", "import os\n", @@ -3138,11 +2791,19 @@ "### What We Did\n", "1. โ Correctly configured nvidia/Nemotron-Personas-Japan as seed data\n", "2. โ Generated data by directly referencing seed data columns\n", - "3. โ Generated synthetic data for jcommonsenseqa and commonsensemoralja\n", - "4. โ Created 2 versions: with and without seed data\n", + "3. โ Generated synthetic data for jcommonsenseqa\n", + "4. โ Created with seed data\n", "5. โ Quality evaluation using LLM-as-a-Judge\n", "6. โ Generated quality comparison report\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e1564da-84a0-45d7-9424-6e08faca3ca1", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {