Transformer_Project/
├── datasets/
│   └── coco_dataset/                   # Data for training and testing
│       ├── train2017/
│       └── annotations/
├── models/
│   ├── text_to_image_transformer_original.pth   # Original trained model
│   └── text_to_image_transformer_optimized.pth  # Optimized model
├── results/
│   ├── generated_images_original/      # Images generated by the original model
│   └── generated_images_optimized/     # Images generated by the optimized model
└── notebooks/
    ├── data_preprocessing.ipynb        # Data loading and preprocessing
    ├── model_training.ipynb            # Model building and training
    ├── optimization_training.ipynb     # Optimized training with techniques like mixed precision
    ├── generate_images.ipynb           # Generating images with trained models
    └── analysis_and_visualization.ipynb  # Analyzing and visualizing results
This notebook handles data loading and preprocessing using the COCO dataset. It includes steps to mount Google Drive, install the necessary libraries, and preprocess the data.
from google.colab import drive # Mount Google Drive drive.mount('/content/drive') # Install necessary libraries !pip install transformers !pip install datasets # Set dataset path dataset_path = '/content/drive/My Drive/Transformer_Project/datasets/coco_dataset/' from datasets import load_dataset from transformers import BertTokenizer # Load and preprocess dataset dataset = load_dataset("coco", split="train") tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") def preprocess_data(example): tokens = tokenizer(example['text'], padding="max_length", truncation=True, return_tensors="pt") return tokens dataset = dataset.map(preprocess_data)
This notebook is used to build the Transformer model and perform initial training.
import torch
import torch.nn as nn
from transformers import BertModel


class TextToImageTransformer(nn.Module):
    """Text-to-image model: a BERT text encoder plus an MLP decoder.

    The caption is encoded with BERT, mean-pooled over the sequence
    dimension into one 768-dim vector, then decoded into a flat RGB image
    by a small MLP whose final Tanh keeps pixel values in [-1, 1].
    """

    def __init__(self, image_size=256, hidden_dim=1024):
        """Build the encoder/decoder pair.

        Args:
            image_size: height/width of the square output image. Defaults
                to 256, matching the previously hard-coded value, so
                existing callers are unaffected.
            hidden_dim: width of the decoder's hidden layer (default 1024,
                also the previous hard-coded value).
        """
        super().__init__()
        self.image_size = image_size
        self.text_encoder = BertModel.from_pretrained('bert-base-uncased')
        # 768 is BERT-base's hidden size (matches last_hidden_state's dim).
        self.gan_decoder = nn.Sequential(
            nn.Linear(768, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, image_size * image_size * 3),
            nn.Tanh(),  # constrains outputs to [-1, 1]
        )

    def forward(self, text_inputs):
        """Generate a batch of images from tokenized captions.

        Args:
            text_inputs: dict of BERT-style inputs (input_ids,
                attention_mask, ...), each of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, 3, image_size, image_size) in [-1, 1].
        """
        # Mean-pool token embeddings into one feature vector per caption.
        text_features = self.text_encoder(**text_inputs).last_hidden_state.mean(dim=1)
        generated_image = self.gan_decoder(text_features)
        return generated_image.view(-1, 3, self.image_size, self.image_size)


# Use the GPU when available and instantiate the model on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextToImageTransformer().to(device)
This notebook focuses on optimizing training with techniques like mixed precision.
from torch.cuda.amp import GradScaler, autocast

# Optimizer and reconstruction loss for the model defined earlier.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.MSELoss()

# Fix: GradScaler/autocast only apply on CUDA; gate them on availability so
# the same cell runs cleanly on CPU instead of warning and doing nothing.
use_amp = torch.cuda.is_available()
scaler = GradScaler(enabled=use_amp)

# NOTE(review): `dataloader` is assumed to be defined in an earlier cell,
# yielding batches with 'text_inputs' (dict of tensors) and 'images'.
model.train()  # fix: was missing — dropout/BN layers must be in train mode
for epoch in range(10):
    for batch in dataloader:
        # Move the batch to the model's device.
        text_inputs = {k: v.to(device) for k, v in batch['text_inputs'].items()}
        target_images = batch['images'].to(device)

        optimizer.zero_grad()
        # Mixed-precision step: autocast runs the forward pass in fp16
        # where safe; the scaler guards against fp16 gradient underflow.
        with autocast(enabled=use_amp):
            output_images = model(text_inputs)
            loss = loss_fn(output_images, target_images)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    # Reports the final batch's loss for this epoch.
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
Generate images using the trained models, and save them to Google Drive.
import torch
from PIL import Image
import numpy as np

# Load the trained weights into the model architecture defined earlier.
# map_location keeps this working whether the checkpoint was saved on GPU
# and is loaded on CPU, or vice versa.
model_path = '/content/drive/My Drive/Transformer_Project/models/text_to_image_transformer_original.pth'
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

# Tokenize the prompt and move the tensors to the model's device
# (fix: the inputs previously stayed on CPU and would crash a CUDA model).
input_text = "A dog playing with a ball in the park."
tokens = tokenizer(input_text, padding="max_length", truncation=True, return_tensors="pt")
tokens = {k: v.to(device) for k, v in tokens.items()}

with torch.no_grad():
    generated_image = model(tokens)

# The decoder ends in Tanh, so pixel values lie in [-1, 1]. Rescale to
# [0, 1] before quantizing (fix: the original clipped [-1, 1] straight to
# [0, 1], discarding the entire negative half of the output range). Also
# move to CPU first — .numpy() raises on a CUDA tensor.
pixels = (generated_image[0].cpu().permute(1, 2, 0).numpy() + 1.0) / 2.0
image_array = (np.clip(pixels, 0, 1) * 255).astype(np.uint8)
image = Image.fromarray(image_array)
image.save('/content/drive/My Drive/Transformer_Project/results/generated_images_original/generated_image_1.png')
Analyze the results by comparing images generated by different versions of the model, and visualize using Matplotlib.
import matplotlib.pyplot as plt
from PIL import Image

# (title, path) pair for each model variant's generated image.
comparisons = [
    ("Original Model",
     '/content/drive/My Drive/Transformer_Project/results/generated_images_original/generated_image_1.png'),
    ("Optimized Model",
     '/content/drive/My Drive/Transformer_Project/results/generated_images_optimized/generated_image_1.png'),
]

# One row, two panels: original on the left, optimized on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for panel, (title, path) in zip(axes, comparisons):
    panel.imshow(Image.open(path))
    panel.set_title(title)
    panel.axis('off')
plt.show()