Fix convert v30 with image datasets
This commit is contained in:
parent
71715c3914
commit
253c649507
|
@ -24,8 +24,9 @@ from typing import Any
|
||||||
|
|
||||||
import jsonlines
|
import jsonlines
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import pyarrow as pa
|
||||||
import tqdm
|
import tqdm
|
||||||
from datasets import Dataset
|
from datasets import Dataset, Features, Image
|
||||||
from huggingface_hub import HfApi, snapshot_download
|
from huggingface_hub import HfApi, snapshot_download
|
||||||
from requests import HTTPError
|
from requests import HTTPError
|
||||||
|
|
||||||
|
@ -138,7 +139,7 @@ def convert_tasks(root, new_root):
|
||||||
write_tasks(df_tasks, new_root)
|
write_tasks(df_tasks, new_root)
|
||||||
|
|
||||||
|
|
||||||
def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx):
|
def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys):
|
||||||
# TODO(rcadene): to save RAM use Dataset.from_parquet(file) and concatenate_datasets
|
# TODO(rcadene): to save RAM use Dataset.from_parquet(file) and concatenate_datasets
|
||||||
dataframes = [pd.read_parquet(file) for file in paths_to_cat]
|
dataframes = [pd.read_parquet(file) for file in paths_to_cat]
|
||||||
# Concatenate all DataFrames along rows
|
# Concatenate all DataFrames along rows
|
||||||
|
@ -146,13 +147,25 @@ def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx):
|
||||||
|
|
||||||
path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
|
path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
concatenated_df.to_parquet(path, index=False)
|
|
||||||
|
if len(image_keys) > 0:
|
||||||
|
schema = pa.Schema.from_pandas(concatenated_df)
|
||||||
|
features = Features.from_arrow_schema(schema)
|
||||||
|
for key in image_keys:
|
||||||
|
features[key] = Image()
|
||||||
|
schema = features.arrow_schema
|
||||||
|
else:
|
||||||
|
schema = None
|
||||||
|
|
||||||
|
concatenated_df.to_parquet(path, index=False, schema=schema)
|
||||||
|
|
||||||
|
|
||||||
def convert_data(root, new_root):
|
def convert_data(root, new_root):
|
||||||
data_dir = root / "data"
|
data_dir = root / "data"
|
||||||
ep_paths = sorted(data_dir.glob("*/*.parquet"))
|
ep_paths = sorted(data_dir.glob("*/*.parquet"))
|
||||||
|
|
||||||
|
image_keys = get_image_keys(root)
|
||||||
|
|
||||||
ep_idx = 0
|
ep_idx = 0
|
||||||
chunk_idx = 0
|
chunk_idx = 0
|
||||||
file_idx = 0
|
file_idx = 0
|
||||||
|
@ -179,7 +192,7 @@ def convert_data(root, new_root):
|
||||||
paths_to_cat.append(ep_path)
|
paths_to_cat.append(ep_path)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx)
|
concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys)
|
||||||
|
|
||||||
# Reset for the next file
|
# Reset for the next file
|
||||||
size_in_mb = ep_size_in_mb
|
size_in_mb = ep_size_in_mb
|
||||||
|
@ -190,7 +203,7 @@ def convert_data(root, new_root):
|
||||||
|
|
||||||
# Write remaining data if any
|
# Write remaining data if any
|
||||||
if paths_to_cat:
|
if paths_to_cat:
|
||||||
concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx)
|
concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys)
|
||||||
|
|
||||||
return episodes_metadata
|
return episodes_metadata
|
||||||
|
|
||||||
|
@ -202,6 +215,13 @@ def get_video_keys(root):
|
||||||
return video_keys
|
return video_keys
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_keys(root):
    """Return the feature names whose dtype is "image".

    Reads the dataset info for ``root`` through ``load_info`` and walks its
    "features" mapping, collecting every key whose entry has
    ``dtype == "image"``. Keys are returned in the mapping's iteration order.
    """
    feature_specs = load_info(root)["features"]
    selected = []
    for name, spec in feature_specs.items():
        if spec["dtype"] == "image":
            selected.append(name)
    return selected
|
||||||
|
|
||||||
|
|
||||||
def convert_videos(root: Path, new_root: Path):
|
def convert_videos(root: Path, new_root: Path):
|
||||||
video_keys = get_video_keys(root)
|
video_keys = get_video_keys(root)
|
||||||
if len(video_keys) == 0:
|
if len(video_keys) == 0:
|
||||||
|
|
Loading…
Reference in New Issue