| | import os |
| | import warnings |
| | from io import BytesIO |
| |
|
| | import numpy as np |
| | import pandas as pd |
| | import requests |
| | from PIL import Image |
| | from sklearn.model_selection import train_test_split |
| |
|
| | |
# NOTE(review): this installs a process-wide "ignore" filter for every warning
# category as soon as the module is imported, so deprecation and pandas
# warnings raised anywhere in the importing program are hidden as well.
warnings.filterwarnings("ignore")
| |
|
| |
|
def _parse_embedding(value):
    """Return *value* as a Python sequence, parsing it first when it is text."""
    if isinstance(value, (str, bytes)):
        # SECURITY NOTE(review): eval() executes arbitrary code. It is kept
        # here to stay compatible with whatever textual format the stored
        # embeddings use; if the column can ever hold untrusted data, switch
        # to ast.literal_eval after confirming the format is a plain literal.
        return eval(value)
    # Already parsed (list / tuple / array): nothing to do.
    return value


def process_embeddings(df, col_name):
    """
    Expand a column of embeddings into one numeric column per dimension.

    Args:
    - df (pd.DataFrame): The DataFrame containing the embeddings column.
    - col_name (str): The name of the column containing the embeddings.
      Each value is either the textual form of a list (e.g. "[0.1, 0.2]")
      or an already-parsed sequence.

    Returns:
    pd.DataFrame: A new DataFrame holding every original column except
    `col_name`, followed by the columns text_1 ... text_N, where N is the
    length of the longest embedding. The input DataFrame is not modified
    and its index is preserved.

    Steps:
    1. Parse the values in the specified column into sequences.
    2. Build one new column per embedding element.
    3. Return the original columns (minus the embeddings column) together
       with the new ones.

    Example:
    df_processed = process_embeddings(df, 'embeddings')

    which turns an 'embeddings' column into:

       text_1  text_2  text_3
    0  -0.123   0.456   0.789
    1   0.321  -0.654   0.987
    """
    # Parse into a separate Series so the caller's DataFrame is left untouched.
    vectors = df[col_name].apply(_parse_embedding)

    # default=0 keeps an empty DataFrame from failing on max().
    width = max((len(vector) for vector in vectors), default=0)

    embeddings_df = pd.DataFrame(
        vectors.to_list(),
        columns=[f"text_{i + 1}" for i in range(width)],
        # Reuse the source index: without it the new frame gets 0..n-1 and
        # concat(axis=1) would misalign rows (and add NaN rows) whenever df
        # has been filtered, sorted or otherwise re-indexed.
        index=df.index,
    )

    return pd.concat([df.drop(columns=[col_name]), embeddings_df], axis=1)
| |
|
| |
|
def rename_image_embeddings(df):
    """
    Rename purely numeric column labels to image_<n>.

    A label is renamed when it is a string made only of decimal digits
    (e.g. "12", and "007" becomes image_7) or a non-negative integer
    (e.g. 12). Every other label is left unchanged.

    Args:
    - df (pd.DataFrame): The DataFrame containing columns to be renamed.

    Returns:
    pd.DataFrame: The same DataFrame object, whose columns have been
    renamed in place.

    Example:
    df_renamed = rename_image_embeddings(df)
    """

    def _image_label(col):
        if isinstance(col, str):
            # isdecimal() (unlike isdigit()) only accepts characters that
            # int() can parse, so labels such as "²" are left alone instead
            # of raising ValueError.
            return f"image_{int(col)}" if col.isdecimal() else col
        # Frames built in memory often carry integer labels (0, 1, 2, ...),
        # which have no .isdigit(); bool is excluded since it subclasses int.
        is_integer = isinstance(col, (int, np.integer)) and not isinstance(col, bool)
        if is_integer and col >= 0:
            return f"image_{int(col)}"
        return col

    df.columns = [_image_label(col) for col in df.columns]
    return df
| |
|
| |
|
def preprocess_data(
    text_data,
    image_data,
    text_id="image_id",
    image_id="ImageName",
    embeddings_col="embeddings",
):
    """
    Combine text embeddings and image embeddings into a single DataFrame.

    Args:
    - text_data (pd.DataFrame): DataFrame containing text data.
    - image_data (pd.DataFrame): DataFrame containing image data.
    - text_id (str): Identifier column in text_data; its values are strings
      that may contain "/" separators.
    - image_id (str): Identifier column in image_data.
    - embeddings_col (str): Column of text_data holding the embeddings.

    Returns:
    pd.DataFrame: Inner merge of both frames on the identifiers, with the
    two identifier columns removed.

    This function:
    Expands the text embeddings and renames the image embedding columns.
    Drops rows whose identifier is missing.
    Reduces each text identifier to the part after its last "/".
    Merges the frames where the text identifier equals the image identifier.
    Drops the identifier columns.

    Example:
    merged_df = preprocess_data(text_df, image_df)
    """

    def _last_segment(value):
        # Keep only what follows the final "/" of the identifier.
        return value.split("/")[-1]

    # Bring both inputs to their final column layout.
    text_frame = process_embeddings(text_data, embeddings_col)
    image_frame = rename_image_embeddings(image_data)

    # Rows without an identifier cannot take part in the merge.
    image_frame = image_frame.dropna(subset=[image_id])
    text_frame = text_frame.dropna(subset=[text_id])

    text_frame[text_id] = text_frame[text_id].apply(_last_segment)

    merged = text_frame.merge(image_frame, left_on=text_id, right_on=image_id)

    # The identifiers were only needed as merge keys.
    merged = merged.drop(columns=[image_id, text_id])
    return merged
| |
|
| |
|
class ImageDownloader:
    """
    Image downloader class to download images from URLs.

    Each image is fetched over HTTP, resized to `image_size` and stored as
    `<image_dir>/<sku>.jpg`.

    Args:
    - image_dir (str): Directory to save images. Created if it is missing.
    - image_size (tuple): Size (width, height) of the images to be saved.
    - overwrite (bool): Whether to download again and replace images that
      already exist on disk.
    - timeout (float | tuple | None): Timeout in seconds handed to
      requests.get so that a stalled server cannot block the run forever.
      Pass None to wait indefinitely.

    Methods:
    - download_images(df, print_every=1000): Download images from URLs in a DataFrame.
        Args:
        - df (pd.DataFrame): DataFrame containing image URLs.
        - print_every (int): Print progress every n images.
        Returns:
        pd.DataFrame: DataFrame with image paths added.

    Example:
    downloader = ImageDownloader()
    df = downloader.download_images(df)
    """

    # Pixel modes the JPEG writer accepts; anything else (RGBA, P, LA, ...)
    # must be converted before saving.
    _JPEG_MODES = ("L", "RGB", "CMYK")

    def __init__(
        self,
        image_dir="data/images/",
        image_size=(224, 224),
        overwrite=False,
        timeout=30,
    ):
        self.image_dir = image_dir
        self.image_size = image_size
        self.overwrite = overwrite
        self.timeout = timeout

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.image_dir, exist_ok=True)

    def _fetch_and_save(self, image_url, image_path):
        """Download one image, resize it and write it to image_path."""
        response = requests.get(image_url, timeout=self.timeout)
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as img:
            # Images with transparency or a palette cannot be written as
            # JPEG; convert them instead of losing them to a save error.
            if img.mode not in self._JPEG_MODES:
                img = img.convert("RGB")
            img = img.resize(self.image_size, Image.Resampling.LANCZOS)
            img.save(image_path)

    def download_images(self, df, print_every=1000):
        """
        Download the image of every row in df.

        Args:
        - df (pd.DataFrame): DataFrame with a "sku" column (used as the file
          name) and an "image" column (the URL to download).
        - print_every (int): Print progress every n images. 0 or None turns
          progress messages off.

        Returns:
        pd.DataFrame: The same DataFrame with an "image_path" column added
        in place. The value is the saved file path, or NaN when the image
        could not be downloaded or saved.
        """
        image_paths = []
        total = len(df)

        for count, (_, row) in enumerate(df.iterrows()):
            if print_every and count % print_every == 0:
                print(f"Downloading image {count}/{total}")

            sku = row["sku"]
            image_url = row["image"]
            image_path = os.path.join(self.image_dir, f"{sku}.jpg")

            if os.path.exists(image_path) and not self.overwrite:
                print(f"Image {sku} is already in the path.")
                image_paths.append(image_path)
                continue

            try:
                self._fetch_and_save(image_url, image_path)
            except Exception as e:
                # Deliberately best-effort: one bad URL or corrupt image must
                # not abort the whole batch, so report it and record NaN.
                print(f"Could not download image for SKU: {sku}. Error: {e}")
                image_paths.append(np.nan)
            else:
                image_paths.append(image_path)

        df["image_path"] = image_paths
        return df
| |
|
| |
|
def train_test_split_and_feature_extraction(df, test_size=0.3, random_state=42):
    """
    Split the rows into train/test sets and list the feature and label columns.

    Args:
    - df (pd.DataFrame): DataFrame containing the data.

    Keyword Args:
    - test_size (float): Size of the test set, forwarded to train_test_split.
    - random_state (int): Seed forwarded to train_test_split for reproducibility.

    Returns:
    pd.DataFrame: Train DataFrame.
    pd.DataFrame: Test DataFrame.
    list: Names of the columns starting with "text_".
    list: Names of the columns starting with "image_".
    list: Label columns, always ["class_id"].

    Example:
    train_df, test_df, text_columns, image_columns, label_columns = train_test_split_and_feature_extraction(df)
    """
    train_df, test_df = train_test_split(
        df, random_state=random_state, test_size=test_size
    )

    # Sort the column names into feature groups by prefix.
    # NOTE(review): any column starting with "image_" is treated as an image
    # feature, including non-embedding columns such as "image_path"; confirm
    # such columns are removed before calling this function.
    text_columns = []
    image_columns = []
    for column in df.columns:
        if column.startswith("text_"):
            text_columns.append(column)
        elif column.startswith("image_"):
            image_columns.append(column)

    label_columns = ["class_id"]

    return train_df, test_df, text_columns, image_columns, label_columns
| |
|