Replace helper functions with native burn functions

This commit is contained in:
Gadersd
2023-09-07 12:23:18 -04:00
parent a62795347f
commit f4c58c1790
20 changed files with 1091 additions and 950 deletions

BIN
img0.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 671 KiB

After

Width:  |  Height:  |  Size: 677 KiB

View File

@@ -1,26 +1,27 @@
use std::env; use std::env;
use std::process;
use std::error::Error; use std::error::Error;
use std::process;
use stablediffusion::model::stablediffusion::{StableDiffusion, load::load_stable_diffusion}; use stablediffusion::model::stablediffusion::{load::load_stable_diffusion, StableDiffusion};
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn, nn,
tensor::{ tensor::{backend::Backend, Tensor},
backend::Backend,
Tensor,
},
}; };
use burn_ndarray::{NdArrayBackend, NdArrayDevice}; use burn_ndarray::{NdArrayBackend, NdArrayDevice};
use burn::record::{self, Recorder, BinFileRecorder, FullPrecisionSettings}; use burn::record::{self, BinFileRecorder, FullPrecisionSettings, Recorder};
fn convert_dump_to_model<B: Backend>(dump_path: &str, model_name: &str, device: &B::Device) -> Result<(), Box<dyn Error>> { fn convert_dump_to_model<B: Backend>(
dump_path: &str,
model_name: &str,
device: &B::Device,
) -> Result<(), Box<dyn Error>> {
println!("Loading dump..."); println!("Loading dump...");
let model: StableDiffusion::<B> = load_stable_diffusion(dump_path, device)?; let model: StableDiffusion<B> = load_stable_diffusion(dump_path, device)?;
println!("Saving model..."); println!("Saving model...");
save_model_file(model, model_name)?; save_model_file(model, model_name)?;
@@ -28,12 +29,11 @@ fn convert_dump_to_model<B: Backend>(dump_path: &str, model_name: &str, device:
Ok(()) Ok(())
} }
fn save_model_file<B: Backend>(model: StableDiffusion<B>, name: &str) -> Result<(), record::RecorderError> { fn save_model_file<B: Backend>(
BinFileRecorder::<FullPrecisionSettings>::new() model: StableDiffusion<B>,
.record( name: &str,
model.into_record(), ) -> Result<(), record::RecorderError> {
name.into(), BinFileRecorder::<FullPrecisionSettings>::new().record(model.into_record(), name.into())
)
} }
fn main() { fn main() {

View File

@@ -1,13 +1,13 @@
use stablediffusion::{tokenizer::SimpleTokenizer, model::stablediffusion::{*, load::load_stable_diffusion}}; use stablediffusion::{
model::stablediffusion::{load::load_stable_diffusion, *},
tokenizer::SimpleTokenizer,
};
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn, nn,
tensor::{ tensor::{backend::Backend, Tensor},
backend::Backend,
Tensor,
},
}; };
cfg_if::cfg_if! { cfg_if::cfg_if! {
@@ -22,12 +22,14 @@ use std::env;
use std::io; use std::io;
use std::process; use std::process;
use burn::record::{self, Recorder, BinFileRecorder, FullPrecisionSettings}; use burn::record::{self, BinFileRecorder, FullPrecisionSettings, Recorder};
fn load_stable_diffusion_model_file<B: Backend>(filename: &str) -> Result<StableDiffusion<B>, record::RecorderError> { fn load_stable_diffusion_model_file<B: Backend>(
filename: &str,
) -> Result<StableDiffusion<B>, record::RecorderError> {
BinFileRecorder::<FullPrecisionSettings>::new() BinFileRecorder::<FullPrecisionSettings>::new()
.load(filename.into()) .load(filename.into())
.map(|record| StableDiffusionConfig::new().init().load_record(record)) .map(|record| StableDiffusionConfig::new().init().load_record(record))
} }
fn main() { fn main() {
@@ -78,17 +80,22 @@ fn main() {
let sd = sd.to_device(&device); let sd = sd.to_device(&device);
let unconditional_context = sd.unconditional_context(&tokenizer); let unconditional_context = sd.unconditional_context(&tokenizer);
let context = sd.context(&tokenizer, prompt).unsqueeze::<3>();//.repeat(0, 2); // generate 2 samples let context = sd.context(&tokenizer, prompt).unsqueeze::<3>(); //.repeat(0, 2); // generate 2 samples
println!("Sampling image..."); println!("Sampling image...");
let images = sd.sample_image(context, unconditional_context, unconditional_guidance_scale, n_steps); let images = sd.sample_image(
context,
unconditional_context,
unconditional_guidance_scale,
n_steps,
);
save_images(&images, output_image_name, 512, 512).unwrap_or_else(|err| { save_images(&images, output_image_name, 512, 512).unwrap_or_else(|err| {
eprintln!("Error saving image: {}", err); eprintln!("Error saving image: {}", err);
process::exit(1); process::exit(1);
}); });
} }
use image::{self, ImageResult, ColorType::Rgb8}; use image::{self, ColorType::Rgb8, ImageResult};
fn save_images(images: &Vec<Vec<u8>>, basepath: &str, width: u32, height: u32) -> ImageResult<()> { fn save_images(images: &Vec<Vec<u8>>, basepath: &str, width: u32, height: u32) -> ImageResult<()> {
for (index, img_data) in images.iter().enumerate() { for (index, img_data) in images.iter().enumerate() {
@@ -103,12 +110,15 @@ fn save_images(images: &Vec<Vec<u8>>, basepath: &str, width: u32, height: u32) -
fn save_test_image() -> ImageResult<()> { fn save_test_image() -> ImageResult<()> {
let width = 256; let width = 256;
let height = 256; let height = 256;
let raw: Vec<_> = (0..width * height).into_iter().flat_map(|i| { let raw: Vec<_> = (0..width * height)
let row = i / width; .into_iter()
let red = (255.0 * row as f64 / height as f64) as u8; .flat_map(|i| {
let row = i / width;
let red = (255.0 * row as f64 / height as f64) as u8;
[red, 0, 0] [red, 0, 0]
}).collect(); })
.collect();
image::save_buffer("red.png", &raw[..], width, height, Rgb8) image::save_buffer("red.png", &raw[..], width, height, Rgb8)
} }

View File

@@ -1,87 +0,0 @@
use burn::{
tensor::{
backend::Backend,
activation::relu,
Tensor,
Int,
Bool,
Float,
TensorKind,
BasicOps,
Numeric,
Element,
},
};
use num_traits::ToPrimitive;
/// Element-wise maximum of `x` and the scalar `max`.
///
/// Evaluated as `relu(x - max) + max`: where an element exceeds `max` the
/// positive difference survives the ReLU and is added back onto `max`,
/// everywhere else the ReLU yields zero and the result is `max` itself.
pub fn tensor_max_scalar<B: Backend, const D: usize>(x: Tensor<B, D>, max: f64) -> Tensor<B, D> {
    let excess = relu(x.sub_scalar(max));
    excess.add_scalar(max)
}
/// Element-wise minimum of `x` and the scalar `min`.
///
/// Implemented through the identity `min(a, b) = -max(-a, -b)`, delegating
/// the maximum to [`tensor_max_scalar`].
pub fn tensor_min_scalar<B: Backend, const D: usize>(x: Tensor<B, D>, min: f64) -> Tensor<B, D> {
    let negated_result = tensor_max_scalar(-x, -min);
    -negated_result
}
/// Element-wise maximum of the tensors `x` and `max`.
///
/// Evaluated as `relu(x - max) + max`; `max` is cloned once because it is
/// needed both inside the ReLU and for the final addition.
pub fn tensor_max<B: Backend, const D: usize>(x: Tensor<B, D>, max: Tensor<B, D>) -> Tensor<B, D> {
    let excess = relu(x - max.clone());
    excess + max
}
/// Element-wise minimum of the tensors `x` and `min`.
///
/// Implemented through the identity `min(a, b) = -max(-a, -b)`, delegating
/// the maximum to [`tensor_max`].
pub fn tensor_min<B: Backend, const D: usize>(x: Tensor<B, D>, min: Tensor<B, D>) -> Tensor<B, D> {
    let negated_result = tensor_max(-x, -min);
    -negated_result
}
/// Element-wise base-10 logarithm of `x`.
///
/// Uses the change-of-base rule: the natural logarithm of every element is
/// divided by `ln(10)`.
pub fn tensor_log10<B: Backend, const D: usize>(x: Tensor<B, D>) -> Tensor<B, D> {
    let natural_log = x.log();
    natural_log / 10.0_f64.ln()
}
/// Returns the largest element of `x` as an `f64`.
///
/// The tensor is flattened to one dimension, the position of the maximum is
/// located with `argmax`, and the element at that position is read back as a
/// scalar.
///
/// # Panics
/// Panics if the scalar cannot be converted to `f64`.
pub fn tensor_max_element<B: Backend, const D: usize>(x: Tensor<B, D>) -> f64 {
    let values = x.flatten::<1>(0, D - 1);
    let winner = values.clone().argmax(0);
    let max_value = values.select(0, winner).into_scalar();
    max_value.to_f64().unwrap()
}
/// Reports whether the sum of the squared elements of `x` is exactly `0.0`.
///
/// NOTE(review): squaring very small magnitudes can presumably underflow to
/// zero in the backend's float type, in which case a tensor with tiny
/// non-zero entries would also be reported as all zeros — confirm whether
/// callers care.
///
/// # Panics
/// Panics if the summed scalar cannot be converted to `f64`.
pub fn all_zeros<B: Backend, const D: usize>(x: Tensor<B, D>) -> bool {
    let sum_of_squares = x.powf(2.0).sum().into_scalar();
    sum_of_squares.to_f64().unwrap() == 0.0
}
/// Selects from `x`, along `dim`, the positions reported by `argmax(dim)`.
///
/// The argmax result is flattened into a one-dimensional index list which is
/// then passed to `select` on the same dimension.
///
/// NOTE(review): `select` looks like it applies the same index list to every
/// slice, so the output may not be the per-slice maxima the name suggests —
/// verify against the callers' expectations.
pub fn max_dim<B: Backend>(x: Tensor<B, 2>, dim: usize) -> Tensor<B, 2> {
    let winners = x.clone().argmax(dim).flatten(0, 1);
    x.select(dim, winners)
}
/// Raises 10 to the power of every element of `x`.
///
/// Uses the identity `10^x = exp(x * ln(10))`.
pub fn _10pow<B: Backend, const D: usize>(x: Tensor<B, D>) -> Tensor<B, D> {
    let ln_of_ten = 10.0_f64.ln();
    let scaled = x * ln_of_ten;
    scaled.exp()
}
/// Converts an integer tensor into a float tensor with the same dimensions.
///
/// The values are pulled out with `into_data`, converted element-wise, and
/// wrapped in a new tensor that is moved to the device `x` was on.
pub fn to_float<B: Backend, const D: usize>(x: Tensor<B, D, Int>) -> Tensor<B, D, Float> {
    // Remember the device before `x` is consumed by `into_data`.
    let origin = x.device();
    let converted = x.into_data().convert();
    let floats: Tensor<B, D, Float> = Tensor::from_data(converted);
    floats.to_device(&origin)
}
/// Converts a boolean tensor into a float tensor with the same dimensions.
///
/// The values are pulled out with `into_data`, converted element-wise, and
/// wrapped in a new tensor that is moved to the device `x` was on.
pub fn to_float_bool<B: Backend, const D: usize>(x: Tensor<B, D, Bool>) -> Tensor<B, D, Float> {
    // Remember the device before `x` is consumed by `into_data`.
    let origin = x.device();
    let converted = x.into_data().convert();
    let floats: Tensor<B, D, Float> = Tensor::from_data(converted);
    floats.to_device(&origin)
}
/// Reverses the order of the elements of `x` along dimension `dim`.
///
/// Builds the index sequence `len - 1, len - 2, ..., 0` on the same device as
/// `x` and selects along `dim` with it.
///
/// A tensor whose `dim` has length zero is returned unchanged; without that
/// guard `len - 1` would underflow `usize` and panic in overflow-checked
/// builds.
///
/// # Panics
/// Panics if `dim` is not a valid index into `x.dims()`.
pub fn reverse<B: Backend, const D: usize, K: TensorKind<B> + BasicOps<B> + Numeric<B>>(
    x: Tensor<B, D, K>,
    dim: usize,
) -> Tensor<B, D, K>
where
    <K as BasicOps<B>>::Elem: Element,
{
    let len = x.dims()[dim];
    if len == 0 {
        // Nothing to reorder, and `len - 1` below would underflow.
        return x;
    }

    // `-(0..len) + (len - 1)` yields the descending indices len-1, ..., 0.
    let indices = -Tensor::arange_device(0..len, &x.device()) + (len - 1) as i64;
    x.select(dim, indices)
}
/// Divides `x` by `y`, rounding the quotient up to the next integer.
///
/// The result is the truncated quotient plus one when there is a remainder.
/// This avoids the intermediate sum of the `(x + y - 1) / y` formulation,
/// which overflows for large operands.
///
/// # Panics
/// Panics if `y` is zero.
pub fn div_roundup(x: usize, y: usize) -> usize {
    x / y + usize::from(x % y != 0)
}

View File

@@ -1,3 +1,2 @@
pub mod model; pub mod model;
pub mod tokenizer; pub mod tokenizer;
pub mod helper;

View File

@@ -1,23 +1,32 @@
use burn::{ use burn::tensor::{activation::softmax, backend::Backend, Tensor};
tensor::{
backend::Backend,
activation::softmax,
Tensor,
},
};
use std::f32::NEG_INFINITY; use std::f32::NEG_INFINITY;
pub fn qkv_attention<B: Backend>(q: Tensor<B, 3>, k: Tensor<B, 3>, v: Tensor<B, 3>, mask: Option<Tensor<B, 2>>, n_head: usize) -> Tensor<B, 3> { pub fn qkv_attention<B: Backend>(
q: Tensor<B, 3>,
k: Tensor<B, 3>,
v: Tensor<B, 3>,
mask: Option<Tensor<B, 2>>,
n_head: usize,
) -> Tensor<B, 3> {
let [n_batch, n_qctx, n_state] = q.dims(); let [n_batch, n_qctx, n_state] = q.dims();
let [_, n_ctx, _] = k.dims(); let [_, n_ctx, _] = k.dims();
let scale = (n_state as f64 / n_head as f64).powf(-0.25); let scale = (n_state as f64 / n_head as f64).powf(-0.25);
let n_hstate = n_state / n_head; let n_hstate = n_state / n_head;
let q = q.reshape([n_batch, n_qctx, n_head, n_hstate]).swap_dims(1, 2) * scale; let q = q
let k = k.reshape([n_batch, n_ctx, n_head, n_hstate]).swap_dims(1, 2).transpose() * scale; .reshape([n_batch, n_qctx, n_head, n_hstate])
let v = v.reshape([n_batch, n_ctx, n_head, n_hstate]).swap_dims(1, 2); .swap_dims(1, 2)
* scale;
let k = k
.reshape([n_batch, n_ctx, n_head, n_hstate])
.swap_dims(1, 2)
.transpose()
* scale;
let v = v
.reshape([n_batch, n_ctx, n_head, n_hstate])
.swap_dims(1, 2);
let qk = q.matmul(k); let qk = q.matmul(k);
@@ -44,4 +53,4 @@ pub fn attn_decoder_mask<B: Backend>(seq_length: usize, device: &B::Device) -> T
} }
return mask.to_device(device); return mask.to_device(device);
} }

View File

@@ -4,29 +4,38 @@ use crate::model::load::*;
use std::error::Error; use std::error::Error;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn, nn,
tensor::{ tensor::{backend::Backend, Tensor},
backend::Backend,
Tensor,
},
}; };
use super::*; use super::*;
use crate::model::groupnorm::load::load_group_norm; use crate::model::groupnorm::load::load_group_norm;
fn load_conv_self_attention_block<B: Backend>(path: &str, device: &B::Device) -> Result<ConvSelfAttentionBlock<B>, Box<dyn Error>> { fn load_conv_self_attention_block<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<ConvSelfAttentionBlock<B>, Box<dyn Error>> {
let norm = load_group_norm(&format!("{}/{}", path, "norm"), device)?; let norm = load_group_norm(&format!("{}/{}", path, "norm"), device)?;
let q = load_conv2d(&format!("{}/{}", path, "q"), device)?; let q = load_conv2d(&format!("{}/{}", path, "q"), device)?;
let k = load_conv2d(&format!("{}/{}", path, "k"), device)?; let k = load_conv2d(&format!("{}/{}", path, "k"), device)?;
let v = load_conv2d(&format!("{}/{}", path, "v"), device)?; let v = load_conv2d(&format!("{}/{}", path, "v"), device)?;
let proj_out = load_conv2d(&format!("{}/{}", path, "proj_out"), device)?; let proj_out = load_conv2d(&format!("{}/{}", path, "proj_out"), device)?;
Ok(ConvSelfAttentionBlock { norm, q, k, v, proj_out }) Ok(ConvSelfAttentionBlock {
norm,
q,
k,
v,
proj_out,
})
} }
fn load_resnet_block<B: Backend>(path: &str, device: &B::Device) -> Result<ResnetBlock<B>, Box<dyn Error>> { fn load_resnet_block<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<ResnetBlock<B>, Box<dyn Error>> {
let norm1 = load_group_norm(&format!("{}/{}", path, "norm1"), device)?; let norm1 = load_group_norm(&format!("{}/{}", path, "norm1"), device)?;
let silu1 = SILU {}; let silu1 = SILU {};
let conv1 = load_conv2d(&format!("{}/{}", path, "conv1"), device)?; let conv1 = load_conv2d(&format!("{}/{}", path, "conv1"), device)?;
@@ -35,7 +44,15 @@ fn load_resnet_block<B: Backend>(path: &str, device: &B::Device) -> Result<Resne
let conv2 = load_conv2d(&format!("{}/{}", path, "conv2"), device)?; let conv2 = load_conv2d(&format!("{}/{}", path, "conv2"), device)?;
let nin_shortcut = load_conv2d(&format!("{}/{}", path, "nin_shortcut"), device).ok(); let nin_shortcut = load_conv2d(&format!("{}/{}", path, "nin_shortcut"), device).ok();
Ok(ResnetBlock { norm1, silu1, conv1, norm2, silu2, conv2, nin_shortcut }) Ok(ResnetBlock {
norm1,
silu1,
conv1,
norm2,
silu2,
conv2,
nin_shortcut,
})
} }
fn load_mid<B: Backend>(path: &str, device: &B::Device) -> Result<Mid<B>, Box<dyn Error>> { fn load_mid<B: Backend>(path: &str, device: &B::Device) -> Result<Mid<B>, Box<dyn Error>> {
@@ -43,14 +60,21 @@ fn load_mid<B: Backend>(path: &str, device: &B::Device) -> Result<Mid<B>, Box<dy
let attn = load_conv_self_attention_block(&format!("{}/{}", path, "attn"), device)?; let attn = load_conv_self_attention_block(&format!("{}/{}", path, "attn"), device)?;
let block_2 = load_resnet_block(&format!("{}/{}", path, "block_2"), device)?; let block_2 = load_resnet_block(&format!("{}/{}", path, "block_2"), device)?;
Ok(Mid { block_1, attn, block_2 }) Ok(Mid {
block_1,
attn,
block_2,
})
} }
fn load_padded_conv2d<B: Backend>(path: &str, device: &B::Device) -> Result<PaddedConv2d<B>, Box<dyn Error>> { fn load_padded_conv2d<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<PaddedConv2d<B>, Box<dyn Error>> {
let conv = load_conv2d(&format!("{}/{}", path, "conv"), device)?; let conv = load_conv2d(&format!("{}/{}", path, "conv"), device)?;
let channels = load_tensor::<B, 1>("channels", path, device)?; let channels = load_tensor::<B, 1>("channels", path, device)?;
let channels = tensor_to_array_2(channels); let channels = tensor_to_array_2(channels);
let kernel_size = load_usize::<B>("kernel_size", path, device)?; let kernel_size = load_usize::<B>("kernel_size", path, device)?;
let stride = load_usize::<B>("stride", path, device)?; let stride = load_usize::<B>("stride", path, device)?;
@@ -61,31 +85,48 @@ fn load_padded_conv2d<B: Backend>(path: &str, device: &B::Device) -> Result<Padd
let mut record = conv.into_record(); let mut record = conv.into_record();
let mut padded_conv: PaddedConv2d<B> = PaddedConv2dConfig::new(channels, kernel_size, padding).with_stride(stride).init(); let mut padded_conv: PaddedConv2d<B> = PaddedConv2dConfig::new(channels, kernel_size, padding)
let padding_actual = PaddingConfig2d::Explicit(padded_conv.padding_actual[0], padded_conv.padding_actual[1]); .with_stride(stride)
.init();
let padding_actual =
PaddingConfig2d::Explicit(padded_conv.padding_actual[0], padded_conv.padding_actual[1]);
record.padding = <PaddingConfig2d as Module<B>>::into_record(padding_actual); record.padding = <PaddingConfig2d as Module<B>>::into_record(padding_actual);
padded_conv.conv = padded_conv.conv.load_record(record); padded_conv.conv = padded_conv.conv.load_record(record);
Ok(padded_conv) Ok(padded_conv)
} }
fn load_decoder_block<B: Backend>(path: &str, device: &B::Device) -> Result<DecoderBlock<B>, Box<dyn Error>> { fn load_decoder_block<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<DecoderBlock<B>, Box<dyn Error>> {
let res1 = load_resnet_block(&format!("{}/{}", path, "res1"), device)?; let res1 = load_resnet_block(&format!("{}/{}", path, "res1"), device)?;
let res2 = load_resnet_block(&format!("{}/{}", path, "res2"), device)?; let res2 = load_resnet_block(&format!("{}/{}", path, "res2"), device)?;
let res3 = load_resnet_block(&format!("{}/{}", path, "res3"), device)?; let res3 = load_resnet_block(&format!("{}/{}", path, "res3"), device)?;
let upsampler = load_conv2d(&format!("{}/{}", path, "upsampler"), device).ok(); let upsampler = load_conv2d(&format!("{}/{}", path, "upsampler"), device).ok();
Ok(DecoderBlock { res1, res2, res3, upsampler }) Ok(DecoderBlock {
res1,
res2,
res3,
upsampler,
})
} }
fn load_encoder_block<B: Backend>(path: &str, device: &B::Device) -> Result<EncoderBlock<B>, Box<dyn Error>> { fn load_encoder_block<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<EncoderBlock<B>, Box<dyn Error>> {
let res1 = load_resnet_block(&format!("{}/{}", path, "res1"), device)?; let res1 = load_resnet_block(&format!("{}/{}", path, "res1"), device)?;
let res2 = load_resnet_block(&format!("{}/{}", path, "res2"), device)?; let res2 = load_resnet_block(&format!("{}/{}", path, "res2"), device)?;
let downsampler = load_padded_conv2d(&format!("{}/{}", path, "downsampler"), device).ok(); let downsampler = load_padded_conv2d(&format!("{}/{}", path, "downsampler"), device).ok();
Ok(EncoderBlock { res1, res2, downsampler }) Ok(EncoderBlock {
res1,
res2,
downsampler,
})
} }
fn load_decoder<B: Backend>(path: &str, device: &B::Device) -> Result<Decoder<B>, Box<dyn Error>> { fn load_decoder<B: Backend>(path: &str, device: &B::Device) -> Result<Decoder<B>, Box<dyn Error>> {
@@ -95,15 +136,21 @@ fn load_decoder<B: Backend>(path: &str, device: &B::Device) -> Result<Decoder<B>
let n_block = load_usize::<B>("n_block", path, device)?; let n_block = load_usize::<B>("n_block", path, device)?;
let mut blocks = (0..n_block) let mut blocks = (0..n_block)
.into_iter() .into_iter()
.map(|i| { .map(|i| load_decoder_block::<B>(&format!("{}/blocks/{}", path, i), device))
load_decoder_block::<B>(&format!("{}/blocks/{}", path, i), device) .collect::<Result<Vec<_>, _>>()?;
}).collect::<Result<Vec<_>, _>>()?;
let norm_out = load_group_norm(&format!("{}/{}", path, "norm_out"), device)?; let norm_out = load_group_norm(&format!("{}/{}", path, "norm_out"), device)?;
let silu = SILU {}; let silu = SILU {};
let conv_out = load_conv2d(&format!("{}/{}", path, "conv_out"), device)?; let conv_out = load_conv2d(&format!("{}/{}", path, "conv_out"), device)?;
Ok(Decoder { conv_in, mid, blocks, norm_out, silu, conv_out }) Ok(Decoder {
conv_in,
mid,
blocks,
norm_out,
silu,
conv_out,
})
} }
fn load_encoder<B: Backend>(path: &str, device: &B::Device) -> Result<Encoder<B>, Box<dyn Error>> { fn load_encoder<B: Backend>(path: &str, device: &B::Device) -> Result<Encoder<B>, Box<dyn Error>> {
@@ -113,22 +160,36 @@ fn load_encoder<B: Backend>(path: &str, device: &B::Device) -> Result<Encoder<B>
let n_block = load_usize::<B>("n_block", path, device)?; let n_block = load_usize::<B>("n_block", path, device)?;
let mut blocks = (0..n_block) let mut blocks = (0..n_block)
.into_iter() .into_iter()
.map(|i| { .map(|i| load_encoder_block::<B>(&format!("{}/blocks/{}", path, i), device))
load_encoder_block::<B>(&format!("{}/blocks/{}", path, i), device) .collect::<Result<Vec<_>, _>>()?;
}).collect::<Result<Vec<_>, _>>()?;
let norm_out = load_group_norm(&format!("{}/{}", path, "norm_out"), device)?; let norm_out = load_group_norm(&format!("{}/{}", path, "norm_out"), device)?;
let silu = SILU {}; let silu = SILU {};
let conv_out = load_conv2d(&format!("{}/{}", path, "conv_out"), device)?; let conv_out = load_conv2d(&format!("{}/{}", path, "conv_out"), device)?;
Ok(Encoder { conv_in, mid, blocks, norm_out, silu, conv_out }) Ok(Encoder {
conv_in,
mid,
blocks,
norm_out,
silu,
conv_out,
})
} }
pub fn load_autoencoder<B: Backend>(path: &str, device: &B::Device) -> Result<Autoencoder<B>, Box<dyn Error>> { pub fn load_autoencoder<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<Autoencoder<B>, Box<dyn Error>> {
let encoder = load_encoder(&format!("{}/{}", path, "encoder"), device)?; let encoder = load_encoder(&format!("{}/{}", path, "encoder"), device)?;
let decoder = load_decoder(&format!("{}/{}", path, "decoder"), device)?; let decoder = load_decoder(&format!("{}/{}", path, "decoder"), device)?;
let quant_conv = load_conv2d(&format!("{}/{}", path, "quant_conv"), device)?; let quant_conv = load_conv2d(&format!("{}/{}", path, "quant_conv"), device)?;
let post_quant_conv = load_conv2d(&format!("{}/{}", path, "post_quant_conv"), device)?; let post_quant_conv = load_conv2d(&format!("{}/{}", path, "post_quant_conv"), device)?;
Ok(Autoencoder { encoder, decoder, quant_conv, post_quant_conv }) Ok(Autoencoder {
} encoder,
decoder,
quant_conv,
post_quant_conv,
})
}

View File

@@ -1,59 +1,59 @@
pub mod load; pub mod load;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn::{self, PaddingConfig2d, conv::{Conv2d, Conv2dConfig, Conv2dRecord}}, nn::{
self,
conv::{Conv2d, Conv2dConfig, Conv2dRecord},
PaddingConfig2d,
},
tensor::{ tensor::{
activation::{sigmoid, softmax},
backend::Backend, backend::Backend,
activation::{softmax, sigmoid}, module::embedding,
module::embedding, Distribution, Int, Tensor,
Tensor,
Distribution,
Int,
}, },
}; };
use crate::helper::div_roundup;
use super::silu::*;
use super::groupnorm::*;
use super::attention::qkv_attention; use super::attention::qkv_attention;
use super::groupnorm::*;
use super::silu::*;
use std::iter; use std::iter;
#[derive(Config)] #[derive(Config)]
pub struct AutoencoderConfig {} pub struct AutoencoderConfig {}
impl AutoencoderConfig { impl AutoencoderConfig {
pub fn init<B: Backend>(&self) -> Autoencoder<B> { pub fn init<B: Backend>(&self) -> Autoencoder<B> {
let encoder = EncoderConfig::new(vec![(128, 128), (128, 256), (256, 512), (512, 512)], 32, 8).init(); let encoder =
let decoder = DecoderConfig::new(vec![(512, 512), (512, 512), (512, 256), (256, 128)], 32).init(); EncoderConfig::new(vec![(128, 128), (128, 256), (256, 512), (512, 512)], 32, 8).init();
let decoder =
DecoderConfig::new(vec![(512, 512), (512, 512), (512, 256), (256, 128)], 32).init();
let quant_conv = Conv2dConfig::new([8, 8], [1, 1]).init(); let quant_conv = Conv2dConfig::new([8, 8], [1, 1]).init();
let post_quant_conv = Conv2dConfig::new([4, 4], [1, 1]).init(); let post_quant_conv = Conv2dConfig::new([4, 4], [1, 1]).init();
Autoencoder { Autoencoder {
encoder, encoder,
decoder, decoder,
quant_conv, quant_conv,
post_quant_conv, post_quant_conv,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct Autoencoder<B: Backend> { pub struct Autoencoder<B: Backend> {
encoder: Encoder<B>, encoder: Encoder<B>,
decoder: Decoder<B>, decoder: Decoder<B>,
quant_conv: Conv2d<B>, quant_conv: Conv2d<B>,
post_quant_conv: Conv2d<B>, post_quant_conv: Conv2d<B>,
} }
impl<B: Backend> Autoencoder<B> { impl<B: Backend> Autoencoder<B> {
pub fn forward(&self, x: Tensor<B, 4>) -> Tensor<B, 4> { pub fn forward(&self, x: Tensor<B, 4>) -> Tensor<B, 4> {
self.decode_latent( self.encode_image(x) ) self.decode_latent(self.encode_image(x))
} }
pub fn encode_image(&self, x: Tensor<B, 4>) -> Tensor<B, 4> { pub fn encode_image(&self, x: Tensor<B, 4>) -> Tensor<B, 4> {
@@ -72,48 +72,60 @@ impl<B: Backend> Autoencoder<B> {
#[derive(Config)] #[derive(Config)]
pub struct EncoderConfig { pub struct EncoderConfig {
channels: Vec<(usize, usize)>, channels: Vec<(usize, usize)>,
n_group: usize, n_group: usize,
n_channels_out: usize, n_channels_out: usize,
} }
impl EncoderConfig { impl EncoderConfig {
fn init<B: Backend>(&self) -> Encoder<B> { fn init<B: Backend>(&self) -> Encoder<B> {
let n_expanded_channels_initial = self.channels.first().map(|f| f.1).expect("Channels must not be empty."); let n_expanded_channels_initial = self
.channels
.first()
.map(|f| f.1)
.expect("Channels must not be empty.");
let n_expanded_channels_final = self.channels.first().unwrap().0; let n_expanded_channels_final = self.channels.first().unwrap().0;
let conv_in = Conv2dConfig::new([3, n_expanded_channels_initial], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init(); let conv_in = Conv2dConfig::new([3, n_expanded_channels_initial], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init();
let blocks = self.channels.iter().enumerate().map(|(i, &(n_channel_in, n_channel_out))| { let blocks = self
let downsample = i != self.channels.len() - 1; .channels
EncoderBlockConfig::new(n_channel_in, n_channel_out, downsample).init() .iter()
}).collect(); .enumerate()
.map(|(i, &(n_channel_in, n_channel_out))| {
let downsample = i != self.channels.len() - 1;
EncoderBlockConfig::new(n_channel_in, n_channel_out, downsample).init()
})
.collect();
let mid = MidConfig::new(n_expanded_channels_final).init(); let mid = MidConfig::new(n_expanded_channels_final).init();
let norm_out = GroupNormConfig::new(self.n_group, n_expanded_channels_final).init(); let norm_out = GroupNormConfig::new(self.n_group, n_expanded_channels_final).init();
let silu = SILU::new(); let silu = SILU::new();
let conv_out = Conv2dConfig::new([n_expanded_channels_final, self.n_channels_out], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init(); let conv_out = Conv2dConfig::new([n_expanded_channels_final, self.n_channels_out], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init();
Encoder { Encoder {
conv_in, conv_in,
mid, mid,
blocks, blocks,
norm_out, norm_out,
silu, silu,
conv_out, conv_out,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct Encoder<B: Backend> { pub struct Encoder<B: Backend> {
conv_in: Conv2d<B>, conv_in: Conv2d<B>,
mid: Mid<B>, mid: Mid<B>,
blocks: Vec<EncoderBlock<B>>, blocks: Vec<EncoderBlock<B>>,
norm_out: GroupNorm<B>, norm_out: GroupNorm<B>,
silu: SILU, silu: SILU,
conv_out: Conv2d<B>, conv_out: Conv2d<B>,
} }
impl<B: Backend> Encoder<B> { impl<B: Backend> Encoder<B> {
@@ -126,55 +138,66 @@ impl<B: Backend> Encoder<B> {
} }
let x = self.mid.forward(x); let x = self.mid.forward(x);
self.conv_out.forward( self.silu.forward( self.norm_out.forward(x) ) ) self.conv_out
.forward(self.silu.forward(self.norm_out.forward(x)))
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct DecoderConfig { pub struct DecoderConfig {
channels: Vec<(usize, usize)>, channels: Vec<(usize, usize)>,
n_group: usize, n_group: usize,
} }
impl DecoderConfig { impl DecoderConfig {
fn init<B: Backend>(&self) -> Decoder<B> { fn init<B: Backend>(&self) -> Decoder<B> {
let n_expanded_channels = self.channels.first().map(|f| f.0).expect("Channels must not be empty."); let n_expanded_channels = self
.channels
.first()
.map(|f| f.0)
.expect("Channels must not be empty.");
let n_condensed_channels = self.channels.last().unwrap().1; let n_condensed_channels = self.channels.last().unwrap().1;
let conv_in = Conv2dConfig::new([4, n_expanded_channels], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init(); let conv_in = Conv2dConfig::new([4, n_expanded_channels], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init();
let mid = MidConfig::new(n_expanded_channels).init(); let mid = MidConfig::new(n_expanded_channels).init();
let blocks = self.channels.iter().enumerate().map(|(i, &(n_channel_in, n_channel_out))| { let blocks = self
let upsample = i != self.channels.len() - 1; .channels
DecoderBlockConfig::new(n_channel_in, n_channel_out, upsample).init() .iter()
}).collect(); .enumerate()
.map(|(i, &(n_channel_in, n_channel_out))| {
let upsample = i != self.channels.len() - 1;
DecoderBlockConfig::new(n_channel_in, n_channel_out, upsample).init()
})
.collect();
let norm_out = GroupNormConfig::new(self.n_group, n_condensed_channels).init(); let norm_out = GroupNormConfig::new(self.n_group, n_condensed_channels).init();
let silu = SILU::new(); let silu = SILU::new();
let conv_out = Conv2dConfig::new([n_condensed_channels, 3], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init(); let conv_out = Conv2dConfig::new([n_condensed_channels, 3], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init();
Decoder { Decoder {
conv_in, conv_in,
mid, mid,
blocks, blocks,
norm_out, norm_out,
silu, silu,
conv_out, conv_out,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct Decoder<B: Backend> { pub struct Decoder<B: Backend> {
conv_in: Conv2d<B>, conv_in: Conv2d<B>,
mid: Mid<B>, mid: Mid<B>,
blocks: Vec<DecoderBlock<B>>, blocks: Vec<DecoderBlock<B>>,
norm_out: GroupNorm<B>, norm_out: GroupNorm<B>,
silu: SILU, silu: SILU,
conv_out: Conv2d<B>, conv_out: Conv2d<B>,
} }
impl<B: Backend> Decoder<B> { impl<B: Backend> Decoder<B> {
@@ -187,15 +210,16 @@ impl<B: Backend> Decoder<B> {
x = block.forward(x); x = block.forward(x);
} }
self.conv_out.forward( self.silu.forward( self.norm_out.forward(x) ) ) self.conv_out
.forward(self.silu.forward(self.norm_out.forward(x)))
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct EncoderBlockConfig { pub struct EncoderBlockConfig {
n_channels_in: usize, n_channels_in: usize,
n_channels_out: usize, n_channels_out: usize,
downsample: bool, downsample: bool,
} }
impl EncoderBlockConfig { impl EncoderBlockConfig {
@@ -204,24 +228,28 @@ impl EncoderBlockConfig {
let res2 = ResnetBlockConfig::new(self.n_channels_out, self.n_channels_out).init(); let res2 = ResnetBlockConfig::new(self.n_channels_out, self.n_channels_out).init();
let downsampler = if self.downsample { let downsampler = if self.downsample {
let padding = Padding::new(0, 1, 0, 1); let padding = Padding::new(0, 1, 0, 1);
Some( PaddedConv2dConfig::new([self.n_channels_out, self.n_channels_out], 3, padding).with_stride(2).init() ) Some(
PaddedConv2dConfig::new([self.n_channels_out, self.n_channels_out], 3, padding)
.with_stride(2)
.init(),
)
} else { } else {
None None
}; };
EncoderBlock { EncoderBlock {
res1, res1,
res2, res2,
downsampler, downsampler,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct EncoderBlock<B: Backend> { pub struct EncoderBlock<B: Backend> {
res1: ResnetBlock<B>, res1: ResnetBlock<B>,
res2: ResnetBlock<B>, res2: ResnetBlock<B>,
downsampler: Option<PaddedConv2d<B>>, downsampler: Option<PaddedConv2d<B>>,
} }
impl<B: Backend> EncoderBlock<B> { impl<B: Backend> EncoderBlock<B> {
@@ -238,9 +266,9 @@ impl<B: Backend> EncoderBlock<B> {
#[derive(Config)] #[derive(Config)]
pub struct DecoderBlockConfig { pub struct DecoderBlockConfig {
n_channels_in: usize, n_channels_in: usize,
n_channels_out: usize, n_channels_out: usize,
upsample: bool, upsample: bool,
} }
impl DecoderBlockConfig { impl DecoderBlockConfig {
@@ -249,26 +277,30 @@ impl DecoderBlockConfig {
let res2 = ResnetBlockConfig::new(self.n_channels_out, self.n_channels_out).init(); let res2 = ResnetBlockConfig::new(self.n_channels_out, self.n_channels_out).init();
let res3 = ResnetBlockConfig::new(self.n_channels_out, self.n_channels_out).init(); let res3 = ResnetBlockConfig::new(self.n_channels_out, self.n_channels_out).init();
let upsampler = if self.upsample { let upsampler = if self.upsample {
Some( Conv2dConfig::new([self.n_channels_out, self.n_channels_out], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init() ) Some(
Conv2dConfig::new([self.n_channels_out, self.n_channels_out], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init(),
)
} else { } else {
None None
}; };
DecoderBlock { DecoderBlock {
res1, res1,
res2, res2,
res3, res3,
upsampler, upsampler,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct DecoderBlock<B: Backend> { pub struct DecoderBlock<B: Backend> {
res1: ResnetBlock<B>, res1: ResnetBlock<B>,
res2: ResnetBlock<B>, res2: ResnetBlock<B>,
res3: ResnetBlock<B>, res3: ResnetBlock<B>,
upsampler: Option<Conv2d<B>>, upsampler: Option<Conv2d<B>>,
} }
impl<B: Backend> DecoderBlock<B> { impl<B: Backend> DecoderBlock<B> {
@@ -280,10 +312,10 @@ impl<B: Backend> DecoderBlock<B> {
if let Some(d) = self.upsampler.as_ref() { if let Some(d) = self.upsampler.as_ref() {
let [n_batch, n_channel, height, width] = x.dims(); let [n_batch, n_channel, height, width] = x.dims();
let x = x let x = x
.reshape([n_batch, n_channel, height, 1, width, 1]) .reshape([n_batch, n_channel, height, 1, width, 1])
.repeat(3, 2) .repeat(3, 2)
.repeat(5, 2) .repeat(5, 2)
.reshape([n_batch, n_channel, 2 * height, 2 * width]); .reshape([n_batch, n_channel, 2 * height, 2 * width]);
d.forward(x) d.forward(x)
} else { } else {
x x
@@ -291,14 +323,13 @@ impl<B: Backend> DecoderBlock<B> {
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct PaddedConv2dConfig { pub struct PaddedConv2dConfig {
channels: [usize; 2], channels: [usize; 2],
kernel_size: usize, kernel_size: usize,
#[config(default = 1)] #[config(default = 1)]
stride: usize, stride: usize,
padding: Padding, padding: Padding,
} }
impl PaddedConv2dConfig { impl PaddedConv2dConfig {
@@ -328,57 +359,68 @@ impl PaddedConv2dConfig {
let padding = self.padding; let padding = self.padding;
PaddedConv2d { PaddedConv2d {
conv, conv,
kernel_size, kernel_size,
stride, stride,
padding, padding,
padding_actual, padding_actual,
} }
} }
} }
/// Divides `x` by `y`, rounding the quotient up to the next integer.
///
/// The result is computed from the quotient and remainder rather than as
/// `(x + y - 1) / y`, so it cannot overflow for values of `x` close to
/// `usize::MAX`.
///
/// # Panics
///
/// Panics if `y` is zero.
fn div_roundup(x: usize, y: usize) -> usize {
    // A non-zero remainder means the exact quotient lies above `x / y`.
    x / y + usize::from(x % y != 0)
}
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct PaddedConv2d<B: Backend> { pub struct PaddedConv2d<B: Backend> {
conv: Conv2d<B>, conv: Conv2d<B>,
kernel_size: usize, kernel_size: usize,
stride: usize, stride: usize,
padding: Padding, padding: Padding,
padding_actual: [usize; 2], padding_actual: [usize; 2],
} }
impl<B: Backend> PaddedConv2d<B> { impl<B: Backend> PaddedConv2d<B> {
fn forward(&self, x: Tensor<B, 4>) -> Tensor<B, 4> { fn forward(&self, x: Tensor<B, 4>) -> Tensor<B, 4> {
println!("{} {} {:?} {:?}", self.kernel_size, self.stride, self.padding, self.padding_actual); println!(
"{} {} {:?} {:?}",
self.kernel_size, self.stride, self.padding, self.padding_actual
);
let [n_batch, n_channel, height, width] = x.dims(); let [n_batch, n_channel, height, width] = x.dims();
let desired_height = (self.padding.pad_top + self.padding.pad_bottom + height - self.kernel_size) / self.stride + 1; let desired_height = (self.padding.pad_top + self.padding.pad_bottom + height
let desired_width = (self.padding.pad_left + self.padding.pad_right + width - self.kernel_size) / self.stride + 1; - self.kernel_size)
/ self.stride
+ 1;
let desired_width = (self.padding.pad_left + self.padding.pad_right + width
- self.kernel_size)
/ self.stride
+ 1;
let skip_vert = (self.padding_actual[0] - self.padding.pad_top) / self.stride; let skip_vert = (self.padding_actual[0] - self.padding.pad_top) / self.stride;
let skip_hor = (self.padding_actual[1] - self.padding.pad_left) / self.stride; let skip_hor = (self.padding_actual[1] - self.padding.pad_left) / self.stride;
self.conv self.conv.forward(x).slice([
.forward(x) 0..n_batch,
.slice([ 0..n_channel,
0..n_batch, skip_vert..(skip_vert + desired_height),
0..n_channel, skip_hor..(skip_hor + desired_width),
skip_vert..(skip_vert + desired_height), ])
skip_hor..(skip_hor + desired_width)
])
} }
} }
#[derive(Config, Module, Copy, Debug)] #[derive(Config, Module, Copy, Debug)]
pub struct Padding { pub struct Padding {
pad_left: usize, pad_left: usize,
pad_right: usize, pad_right: usize,
pad_top: usize, pad_top: usize,
pad_bottom: usize, pad_bottom: usize,
} }
#[derive(Config)] #[derive(Config)]
pub struct MidConfig { pub struct MidConfig {
n_channel: usize, n_channel: usize,
} }
impl MidConfig { impl MidConfig {
@@ -388,18 +430,18 @@ impl MidConfig {
let block_2 = ResnetBlockConfig::new(self.n_channel, self.n_channel).init(); let block_2 = ResnetBlockConfig::new(self.n_channel, self.n_channel).init();
Mid { Mid {
block_1, block_1,
attn, attn,
block_2, block_2,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct Mid<B: Backend> { pub struct Mid<B: Backend> {
block_1: ResnetBlock<B>, block_1: ResnetBlock<B>,
attn: ConvSelfAttentionBlock<B>, attn: ConvSelfAttentionBlock<B>,
block_2: ResnetBlock<B>, block_2: ResnetBlock<B>,
} }
impl<B: Backend> Mid<B> { impl<B: Backend> Mid<B> {
@@ -411,21 +453,24 @@ impl<B: Backend> Mid<B> {
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct ResnetBlockConfig { pub struct ResnetBlockConfig {
in_channels: usize, in_channels: usize,
out_channels: usize, out_channels: usize,
} }
impl ResnetBlockConfig { impl ResnetBlockConfig {
fn init<B: Backend>(&self) -> ResnetBlock<B> { fn init<B: Backend>(&self) -> ResnetBlock<B> {
let norm1 = GroupNormConfig::new(32, self.in_channels).init(); let norm1 = GroupNormConfig::new(32, self.in_channels).init();
let conv1 = Conv2dConfig::new([self.in_channels, self.out_channels], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init(); let conv1 = Conv2dConfig::new([self.in_channels, self.out_channels], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init();
let norm2 = GroupNormConfig::new(32, self.out_channels).init(); let norm2 = GroupNormConfig::new(32, self.out_channels).init();
let conv2 = Conv2dConfig::new([self.out_channels, self.out_channels], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init(); let conv2 = Conv2dConfig::new([self.out_channels, self.out_channels], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init();
let nin_shortcut = if self.in_channels != self.out_channels { let nin_shortcut = if self.in_channels != self.out_channels {
Some( Conv2dConfig::new([self.in_channels, self.out_channels], [1, 1]).init() ) Some(Conv2dConfig::new([self.in_channels, self.out_channels], [1, 1]).init())
} else { } else {
None None
}; };
@@ -434,34 +479,37 @@ impl ResnetBlockConfig {
let silu2 = SILU::new(); let silu2 = SILU::new();
ResnetBlock { ResnetBlock {
norm1, norm1,
silu1, silu1,
conv1, conv1,
norm2, norm2,
silu2, silu2,
conv2, conv2,
nin_shortcut, nin_shortcut,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct ResnetBlock<B: Backend> { pub struct ResnetBlock<B: Backend> {
norm1: GroupNorm<B>, norm1: GroupNorm<B>,
silu1: SILU, silu1: SILU,
conv1: Conv2d<B>, conv1: Conv2d<B>,
norm2: GroupNorm<B>, norm2: GroupNorm<B>,
silu2: SILU, silu2: SILU,
conv2: Conv2d<B>, conv2: Conv2d<B>,
nin_shortcut: Option<Conv2d<B>>, nin_shortcut: Option<Conv2d<B>>,
} }
impl<B: Backend> ResnetBlock<B> { impl<B: Backend> ResnetBlock<B> {
fn forward(&self, x: Tensor<B, 4>) -> Tensor<B, 4> { fn forward(&self, x: Tensor<B, 4>) -> Tensor<B, 4> {
let h = self.conv1.forward( self.silu1.forward(self.norm1.forward(x.clone())) ); let h = self
let h = self.conv2.forward( self.silu2.forward(self.norm2.forward(h)) ); .conv1
.forward(self.silu1.forward(self.norm1.forward(x.clone())));
let h = self
.conv2
.forward(self.silu2.forward(self.norm2.forward(h)));
if let Some(ns) = self.nin_shortcut.as_ref() { if let Some(ns) = self.nin_shortcut.as_ref() {
ns.forward(x) + h ns.forward(x) + h
} else { } else {
@@ -472,7 +520,7 @@ impl<B: Backend> ResnetBlock<B> {
#[derive(Config)] #[derive(Config)]
pub struct ConvSelfAttentionBlockConfig { pub struct ConvSelfAttentionBlockConfig {
n_channel: usize, n_channel: usize,
} }
impl ConvSelfAttentionBlockConfig { impl ConvSelfAttentionBlockConfig {
@@ -484,22 +532,22 @@ impl ConvSelfAttentionBlockConfig {
let proj_out = Conv2dConfig::new([self.n_channel, self.n_channel], [1, 1]).init(); let proj_out = Conv2dConfig::new([self.n_channel, self.n_channel], [1, 1]).init();
ConvSelfAttentionBlock { ConvSelfAttentionBlock {
norm, norm,
q, q,
k, k,
v, v,
proj_out, proj_out,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct ConvSelfAttentionBlock<B: Backend> { pub struct ConvSelfAttentionBlock<B: Backend> {
norm: GroupNorm<B>, norm: GroupNorm<B>,
q: Conv2d<B>, q: Conv2d<B>,
k: Conv2d<B>, k: Conv2d<B>,
v: Conv2d<B>, v: Conv2d<B>,
proj_out: Conv2d<B>, proj_out: Conv2d<B>,
} }
impl<B: Backend> ConvSelfAttentionBlock<B> { impl<B: Backend> ConvSelfAttentionBlock<B> {
@@ -508,9 +556,21 @@ impl<B: Backend> ConvSelfAttentionBlock<B> {
let h = self.norm.forward(x.clone()); let h = self.norm.forward(x.clone());
let q = self.q.forward(h.clone()).reshape([n_batch, n_channel, height * width]).swap_dims(1, 2); let q = self
let k = self.k.forward(h.clone()).reshape([n_batch, n_channel, height * width]).swap_dims(1, 2); .q
let v = self.v.forward(h).reshape([n_batch, n_channel, height * width]).swap_dims(1, 2); .forward(h.clone())
.reshape([n_batch, n_channel, height * width])
.swap_dims(1, 2);
let k = self
.k
.forward(h.clone())
.reshape([n_batch, n_channel, height * width])
.swap_dims(1, 2);
let v = self
.v
.forward(h)
.reshape([n_batch, n_channel, height * width])
.swap_dims(1, 2);
let wv = qkv_attention(q, k, v, None, 1) let wv = qkv_attention(q, k, v, None, 1)
.swap_dims(1, 2) .swap_dims(1, 2)

View File

@@ -1,14 +1,11 @@
use std::error::Error;
use burn::tensor::ElementConversion; use burn::tensor::ElementConversion;
use std::error::Error;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn, nn,
tensor::{ tensor::{backend::Backend, Tensor},
backend::Backend,
Tensor,
},
}; };
use super::*; use super::*;
@@ -28,7 +25,10 @@ pub fn load_mlp<B: Backend>(path: &str, device: &B::Device) -> Result<MLP<B>, Bo
Ok(mlp) Ok(mlp)
} }
pub fn load_multi_head_self_attention<B: Backend>(path: &str, device: &B::Device) -> Result<MultiHeadSelfAttention<B>, Box<dyn Error>> { pub fn load_multi_head_self_attention<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<MultiHeadSelfAttention<B>, Box<dyn Error>> {
let n_head = load_usize::<B>("n_head", path, device)?; let n_head = load_usize::<B>("n_head", path, device)?;
let query = load_linear(&format!("{}/{}", path, "query"), device)?; let query = load_linear(&format!("{}/{}", path, "query"), device)?;
let key = load_linear(&format!("{}/{}", path, "key"), device)?; let key = load_linear(&format!("{}/{}", path, "key"), device)?;
@@ -46,7 +46,10 @@ pub fn load_multi_head_self_attention<B: Backend>(path: &str, device: &B::Device
Ok(mhsa) Ok(mhsa)
} }
pub fn load_residual_decoder_attention_block<B: Backend>(path: &str, device: &B::Device) -> Result<ResidualDecoderAttentionBlock<B>, Box<dyn Error>> { pub fn load_residual_decoder_attention_block<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<ResidualDecoderAttentionBlock<B>, Box<dyn Error>> {
let mlp = load_mlp(&format!("{}/{}", path, "mlp"), device)?; let mlp = load_mlp(&format!("{}/{}", path, "mlp"), device)?;
let attn = load_multi_head_self_attention(&format!("{}/{}", path, "attn"), device)?; let attn = load_multi_head_self_attention(&format!("{}/{}", path, "attn"), device)?;
let attn_ln = load_layer_norm(&format!("{}/{}", path, "attn_ln"), device)?; let attn_ln = load_layer_norm(&format!("{}/{}", path, "attn_ln"), device)?;
@@ -64,15 +67,17 @@ pub fn load_residual_decoder_attention_block<B: Backend>(path: &str, device: &B:
pub fn load_clip<B: Backend>(path: &str, device: &B::Device) -> Result<CLIP<B>, Box<dyn Error>> { pub fn load_clip<B: Backend>(path: &str, device: &B::Device) -> Result<CLIP<B>, Box<dyn Error>> {
let token_embedding = load_embedding(&format!("{}/{}", path, "token_embedding"), device)?; let token_embedding = load_embedding(&format!("{}/{}", path, "token_embedding"), device)?;
let position_embedding = load_tensor("weight", &format!("{}/position_embedding", path), device)?.into(); let position_embedding =
load_tensor("weight", &format!("{}/position_embedding", path), device)?.into();
let n_layer = load_usize::<B>("n_layer", path, device)?; let n_layer = load_usize::<B>("n_layer", path, device)?;
let mut blocks = (0..n_layer) let mut blocks = (0..n_layer)
.into_iter() .into_iter()
.map(|i| { .map(|i| {
load_residual_decoder_attention_block::<B>(&format!("{}/blocks/{}", path, i), device) load_residual_decoder_attention_block::<B>(&format!("{}/blocks/{}", path, i), device)
}).collect::<Result<Vec<_>, _>>()?; })
.collect::<Result<Vec<_>, _>>()?;
let layer_norm = load_layer_norm(&format!("{}/{}", path, "layer_norm"), device)?; let layer_norm = load_layer_norm(&format!("{}/{}", path, "layer_norm"), device)?;
let clip = CLIP { let clip = CLIP {
@@ -81,6 +86,6 @@ pub fn load_clip<B: Backend>(path: &str, device: &B::Device) -> Result<CLIP<B>,
blocks: blocks, blocks: blocks,
layer_norm: layer_norm, layer_norm: layer_norm,
}; };
Ok(clip) Ok(clip)
} }

View File

@@ -1,35 +1,33 @@
pub mod load; pub mod load;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn, nn,
tensor::{ tensor::{
activation::{sigmoid, softmax},
backend::Backend, backend::Backend,
activation::{softmax, sigmoid}, module::embedding,
module::embedding, Distribution, Int, Tensor,
Tensor,
Distribution,
Int,
}, },
}; };
use crate::model::attention::{qkv_attention, attn_decoder_mask}; use crate::model::attention::{attn_decoder_mask, qkv_attention};
#[derive(Config)] #[derive(Config)]
pub struct CLIPConfig { pub struct CLIPConfig {
n_vocab: usize, n_vocab: usize,
n_state: usize, n_state: usize,
n_head: usize, n_head: usize,
n_ctx: usize, n_ctx: usize,
n_layer: usize, n_layer: usize,
} }
impl CLIPConfig { impl CLIPConfig {
pub fn init<B: Backend>(&self) -> CLIP<B> { pub fn init<B: Backend>(&self) -> CLIP<B> {
let token_embedding = nn::EmbeddingConfig::new(self.n_vocab, self.n_state).init(); let token_embedding = nn::EmbeddingConfig::new(self.n_vocab, self.n_state).init();
let position_embedding = Tensor::random([self.n_ctx, self.n_state], Distribution::Normal(0.0, 1.0)).into(); let position_embedding =
Tensor::random([self.n_ctx, self.n_state], Distribution::Normal(0.0, 1.0)).into();
let blocks = (0..self.n_layer) let blocks = (0..self.n_layer)
.into_iter() .into_iter()
.map(|_| ResidualDecoderAttentionBlockConfig::new(self.n_state, self.n_head).init()) .map(|_| ResidualDecoderAttentionBlockConfig::new(self.n_state, self.n_head).init())
@@ -37,33 +35,35 @@ impl CLIPConfig {
let layer_norm = nn::LayerNormConfig::new(self.n_state).init(); let layer_norm = nn::LayerNormConfig::new(self.n_state).init();
CLIP { CLIP {
token_embedding, token_embedding,
position_embedding, position_embedding,
blocks, blocks,
layer_norm, layer_norm,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct CLIP<B: Backend> { pub struct CLIP<B: Backend> {
token_embedding: nn::Embedding<B>, token_embedding: nn::Embedding<B>,
position_embedding: Param<Tensor<B, 2>>, position_embedding: Param<Tensor<B, 2>>,
blocks: Vec<ResidualDecoderAttentionBlock<B>>, blocks: Vec<ResidualDecoderAttentionBlock<B>>,
layer_norm: nn::LayerNorm<B>, layer_norm: nn::LayerNorm<B>,
} }
impl<B: Backend> CLIP<B> { impl<B: Backend> CLIP<B> {
pub fn forward(&self, x: Tensor<B, 2, Int>) -> Tensor<B, 3> { pub fn forward(&self, x: Tensor<B, 2, Int>) -> Tensor<B, 3> {
let [n_batch, seq_len] = x.dims(); let [n_batch, seq_len] = x.dims();
let mask = attn_decoder_mask(seq_len, &x.device()); let mask = attn_decoder_mask(seq_len, &x.device());
let embedded = self.token_embedding.forward(x) let embedded = self.token_embedding.forward(x)
+ self.position_embedding.val().slice([0..seq_len]).unsqueeze(); + self
.position_embedding
.val()
.slice([0..seq_len])
.unsqueeze();
let mut x = embedded; let mut x = embedded;
for block in &self.blocks { for block in &self.blocks {
x = block.forward(x, mask.clone()); x = block.forward(x, mask.clone());
@@ -73,37 +73,35 @@ impl<B: Backend> CLIP<B> {
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct ResidualDecoderAttentionBlockConfig { pub struct ResidualDecoderAttentionBlockConfig {
n_state: usize, n_state: usize,
n_head: usize, n_head: usize,
} }
impl ResidualDecoderAttentionBlockConfig { impl ResidualDecoderAttentionBlockConfig {
pub fn init<B: Backend>(&self) -> ResidualDecoderAttentionBlock<B> { pub fn init<B: Backend>(&self) -> ResidualDecoderAttentionBlock<B> {
let attn = MultiHeadSelfAttentionConfig::new(self.n_state, self.n_head).init(); let attn = MultiHeadSelfAttentionConfig::new(self.n_state, self.n_head).init();
let attn_ln = nn::LayerNormConfig::new(self.n_state).init(); let attn_ln = nn::LayerNormConfig::new(self.n_state).init();
let mlp = MLPConfig::new(self.n_state, 4 * self.n_state).init(); let mlp = MLPConfig::new(self.n_state, 4 * self.n_state).init();
let mlp_ln = nn::LayerNormConfig::new(self.n_state).init(); let mlp_ln = nn::LayerNormConfig::new(self.n_state).init();
ResidualDecoderAttentionBlock { ResidualDecoderAttentionBlock {
attn, attn,
attn_ln, attn_ln,
mlp, mlp,
mlp_ln, mlp_ln,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct ResidualDecoderAttentionBlock<B: Backend> { pub struct ResidualDecoderAttentionBlock<B: Backend> {
attn: MultiHeadSelfAttention<B>, attn: MultiHeadSelfAttention<B>,
attn_ln: nn::LayerNorm<B>, attn_ln: nn::LayerNorm<B>,
mlp: MLP<B>, mlp: MLP<B>,
mlp_ln: nn::LayerNorm<B>, mlp_ln: nn::LayerNorm<B>,
} }
impl<B: Backend> ResidualDecoderAttentionBlock<B> { impl<B: Backend> ResidualDecoderAttentionBlock<B> {
@@ -117,12 +115,17 @@ impl<B: Backend> ResidualDecoderAttentionBlock<B> {
#[derive(Config)] #[derive(Config)]
pub struct MultiHeadSelfAttentionConfig { pub struct MultiHeadSelfAttentionConfig {
n_state: usize, n_state: usize,
n_head: usize, n_head: usize,
} }
impl MultiHeadSelfAttentionConfig { impl MultiHeadSelfAttentionConfig {
fn init<B: Backend>(&self) -> MultiHeadSelfAttention<B> { fn init<B: Backend>(&self) -> MultiHeadSelfAttention<B> {
assert!(self.n_state % self.n_head == 0, "State size {} must be a multiple of head size {}", self.n_state, self.n_head); assert!(
self.n_state % self.n_head == 0,
"State size {} must be a multiple of head size {}",
self.n_state,
self.n_head
);
let n_head = self.n_head; let n_head = self.n_head;
let query = nn::LinearConfig::new(self.n_state, self.n_state).init(); let query = nn::LinearConfig::new(self.n_state, self.n_state).init();
@@ -130,23 +133,23 @@ impl MultiHeadSelfAttentionConfig {
let value = nn::LinearConfig::new(self.n_state, self.n_state).init(); let value = nn::LinearConfig::new(self.n_state, self.n_state).init();
let out = nn::LinearConfig::new(self.n_state, self.n_state).init(); let out = nn::LinearConfig::new(self.n_state, self.n_state).init();
MultiHeadSelfAttention { MultiHeadSelfAttention {
n_head, n_head,
query, query,
key, key,
value, value,
out out,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct MultiHeadSelfAttention<B: Backend> { pub struct MultiHeadSelfAttention<B: Backend> {
n_head: usize, n_head: usize,
query: nn::Linear<B>, query: nn::Linear<B>,
key: nn::Linear<B>, key: nn::Linear<B>,
value: nn::Linear<B>, value: nn::Linear<B>,
out: nn::Linear<B>, out: nn::Linear<B>,
} }
impl<B: Backend> MultiHeadSelfAttention<B> { impl<B: Backend> MultiHeadSelfAttention<B> {
@@ -161,17 +164,10 @@ impl<B: Backend> MultiHeadSelfAttention<B> {
} }
} }
#[derive(Config, Debug)] #[derive(Config, Debug)]
pub struct MLPConfig { pub struct MLPConfig {
input_size: usize, input_size: usize,
hidden_size: usize, hidden_size: usize,
} }
impl MLPConfig { impl MLPConfig {
@@ -180,19 +176,15 @@ impl MLPConfig {
let gelu = QuickGELU::new(); let gelu = QuickGELU::new();
let fc2 = nn::LinearConfig::new(self.hidden_size, self.input_size).init(); let fc2 = nn::LinearConfig::new(self.hidden_size, self.input_size).init();
MLP { MLP { fc1, gelu, fc2 }
fc1,
gelu,
fc2,
}
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct MLP<B: Backend> { pub struct MLP<B: Backend> {
fc1: nn::Linear<B>, fc1: nn::Linear<B>,
gelu: QuickGELU, gelu: QuickGELU,
fc2: nn::Linear<B>, fc2: nn::Linear<B>,
} }
impl<B: Backend> MLP<B> { impl<B: Backend> MLP<B> {
@@ -217,4 +209,3 @@ impl QuickGELU {
x.clone() * sigmoid(x * 1.702) x.clone() * sigmoid(x * 1.702)
} }
} }

View File

@@ -4,30 +4,34 @@ use crate::model::load::*;
use std::error::Error; use std::error::Error;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn, nn,
tensor::{ tensor::{backend::Backend, Tensor},
backend::Backend,
Tensor,
},
}; };
pub fn load_group_norm<B: Backend>(path: &str, device: &B::Device) -> Result<GroupNorm<B>, Box<dyn Error>> { pub fn load_group_norm<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<GroupNorm<B>, Box<dyn Error>> {
let n_group = load_usize::<B>("n_group", path, device)?.into(); let n_group = load_usize::<B>("n_group", path, device)?.into();
let n_channel = load_usize::<B>("n_channel", path, device)?.into(); let n_channel = load_usize::<B>("n_channel", path, device)?.into();
let eps = load_f32::<B>("eps", path, device)?.into(); let eps = load_f32::<B>("eps", path, device)?.into();
let gamma = load_tensor::<B, 1>("weight", path, device).ok().unwrap_or_else(|| Tensor::ones_device([n_channel], device)).into(); let gamma = load_tensor::<B, 1>("weight", path, device)
let beta = load_tensor::<B, 1>("bias", path, device).ok().unwrap_or_else(|| Tensor::zeros_device([n_channel], device)).into(); .ok()
.unwrap_or_else(|| Tensor::ones_device([n_channel], device))
.into();
let beta = load_tensor::<B, 1>("bias", path, device)
.ok()
.unwrap_or_else(|| Tensor::zeros_device([n_channel], device))
.into();
Ok( Ok(GroupNorm {
GroupNorm { n_group,
n_group, n_channel,
n_channel, gamma,
gamma, beta,
beta, eps,
eps, })
} }
)
}

View File

@@ -1,25 +1,27 @@
pub mod load; pub mod load;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
tensor::{ tensor::{backend::Backend, Tensor},
backend::Backend,
Tensor,
},
}; };
#[derive(Config)] #[derive(Config)]
pub struct GroupNormConfig { pub struct GroupNormConfig {
n_group: usize, n_group: usize,
n_channel: usize, n_channel: usize,
#[config(default = 1e-5)] #[config(default = 1e-5)]
eps: f64, eps: f64,
} }
impl GroupNormConfig { impl GroupNormConfig {
pub fn init<B: Backend>(&self) -> GroupNorm<B> { pub fn init<B: Backend>(&self) -> GroupNorm<B> {
assert!(self.n_channel % self.n_group == 0, "The number of channels {} must be divisible by the number of groups {}", self.n_channel, self.n_group); assert!(
self.n_channel % self.n_group == 0,
"The number of channels {} must be divisible by the number of groups {}",
self.n_channel,
self.n_group
);
let n_per_group = self.n_channel / self.n_group; let n_per_group = self.n_channel / self.n_group;
@@ -29,22 +31,22 @@ impl GroupNormConfig {
let eps = self.eps; let eps = self.eps;
GroupNorm { GroupNorm {
n_group: self.n_group, n_group: self.n_group,
n_channel: self.n_channel, n_channel: self.n_channel,
gamma, gamma,
beta, beta,
eps, eps,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct GroupNorm<B: Backend> { pub struct GroupNorm<B: Backend> {
n_group: usize, n_group: usize,
n_channel: usize, n_channel: usize,
gamma: Param<Tensor<B, 1>>, gamma: Param<Tensor<B, 1>>,
beta: Param<Tensor<B, 1>>, beta: Param<Tensor<B, 1>>,
eps: f64, eps: f64,
} }
impl<B: Backend> GroupNorm<B> { impl<B: Backend> GroupNorm<B> {
@@ -56,10 +58,17 @@ impl<B: Backend> GroupNorm<B> {
let mut affine_shape = [1; D]; let mut affine_shape = [1; D];
affine_shape[1] = self.n_channel; affine_shape[1] = self.n_channel;
layernorm( x.reshape([n_batch, self.n_group, num_elements / (n_batch * self.n_group) ]), self.eps ) layernorm(
.reshape(shape) x.reshape([
.mul(self.gamma.val().reshape(affine_shape)) n_batch,
.add(self.beta.val().reshape(affine_shape)) self.n_group,
num_elements / (n_batch * self.n_group),
]),
self.eps,
)
.reshape(shape)
.mul(self.gamma.val().reshape(affine_shape))
.add(self.beta.val().reshape(affine_shape))
} }
} }
@@ -68,5 +77,6 @@ pub fn layernorm<B: Backend, const D: usize>(x: Tensor<B, D>, eps: f64) -> Tenso
//x.sub(mean).div(var.sqrt().add_scalar(eps)) //x.sub(mean).div(var.sqrt().add_scalar(eps))
let u = x.clone() - x.mean_dim(D - 1); let u = x.clone() - x.mean_dim(D - 1);
u.clone().div( (u.clone() * u).mean_dim(D - 1).add_scalar(eps).sqrt() ) u.clone()
} .div((u.clone() * u).mean_dim(D - 1).add_scalar(eps).sqrt())
}

View File

@@ -1,36 +1,38 @@
use std::error::Error;
use std::io::Read;
use npy::{self, NpyData}; use npy::{self, NpyData};
use num_traits::cast::ToPrimitive; use num_traits::cast::ToPrimitive;
use std::error::Error;
use std::io::Read;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn::{self, conv}, nn::{self, conv},
tensor::{ tensor::{backend::Backend, Data, Tensor},
backend::Backend,
Tensor,
Data,
},
}; };
use burn::tensor::ElementConversion; use burn::tensor::ElementConversion;
pub fn numpy_to_tensor<B: Backend, const D: usize>(numpy_data: NpyData<f32>, device: &B::Device) -> Tensor<B, D> { pub fn numpy_to_tensor<B: Backend, const D: usize>(
numpy_data: NpyData<f32>,
device: &B::Device,
) -> Tensor<B, D> {
let mut v = numpy_data.to_vec(); let mut v = numpy_data.to_vec();
let shape: Vec<_> = v[0..D].into_iter().map(|&v| v as usize).collect(); let shape: Vec<_> = v[0..D].into_iter().map(|&v| v as usize).collect();
let data: Vec<B::FloatElem> = v[D..].into_iter().map(|e| e.elem()).collect(); let data: Vec<B::FloatElem> = v[D..].into_iter().map(|e| e.elem()).collect();
Tensor::from_data_device(Data::new(data, shape.into()), device) Tensor::from_data_device(Data::new(data, shape.into()), device)
} }
pub fn load_tensor<B: Backend, const D: usize>(name: &str, path: &str, device: &B::Device) -> Result<Tensor<B, D>, Box<dyn Error>> { pub fn load_tensor<B: Backend, const D: usize>(
name: &str,
path: &str,
device: &B::Device,
) -> Result<Tensor<B, D>, Box<dyn Error>> {
let tensor_path = format!("{}/{}.npy", path, name); let tensor_path = format!("{}/{}.npy", path, name);
let mut buf = vec![]; let mut buf = vec![];
std::fs::File::open(&tensor_path)? std::fs::File::open(&tensor_path)?.read_to_end(&mut buf)?;
.read_to_end(&mut buf)?;
let tensor_numpy: NpyData<f32> = NpyData::from_bytes(&buf)?; let tensor_numpy: NpyData<f32> = NpyData::from_bytes(&buf)?;
@@ -41,15 +43,26 @@ pub fn load_tensor<B: Backend, const D: usize>(name: &str, path: &str, device: &
Ok(tensor) Ok(tensor)
} }
pub fn load_f32<B: Backend>(name: &str, path: &str, device: &B::Device) -> Result<f32, Box<dyn Error>> { pub fn load_f32<B: Backend>(
name: &str,
path: &str,
device: &B::Device,
) -> Result<f32, Box<dyn Error>> {
load_tensor::<B, 1>(name, path, device).map(|t| t.into_scalar().to_f32().unwrap()) load_tensor::<B, 1>(name, path, device).map(|t| t.into_scalar().to_f32().unwrap())
} }
pub fn load_usize<B: Backend>(name: &str, path: &str, device: &B::Device) -> Result<usize, Box<dyn Error>> { pub fn load_usize<B: Backend>(
name: &str,
path: &str,
device: &B::Device,
) -> Result<usize, Box<dyn Error>> {
load_tensor::<B, 1>(name, path, device).map(|t| t.into_scalar().to_usize().unwrap()) load_tensor::<B, 1>(name, path, device).map(|t| t.into_scalar().to_usize().unwrap())
} }
pub fn load_linear<B: Backend>(path: &str, device: &B::Device) -> Result<nn::Linear<B>, Box<dyn Error>> { pub fn load_linear<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<nn::Linear<B>, Box<dyn Error>> {
let weight = load_tensor::<B, 2>("weight", path, device)?; let weight = load_tensor::<B, 2>("weight", path, device)?;
let bias = load_tensor::<B, 1>("bias", path, device).ok(); let bias = load_tensor::<B, 1>("bias", path, device).ok();
@@ -62,7 +75,10 @@ pub fn load_linear<B: Backend>(path: &str, device: &B::Device) -> Result<nn::Lin
Ok(linear) Ok(linear)
} }
pub fn load_embedding<B: Backend>(path: &str, device: &B::Device) -> Result<nn::Embedding<B>, Box<dyn Error>> { pub fn load_embedding<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<nn::Embedding<B>, Box<dyn Error>> {
let weight = load_tensor::<B, 2>("weight", path, device)?; let weight = load_tensor::<B, 2>("weight", path, device)?;
let [n_vocab, n_state] = weight.dims(); let [n_vocab, n_state] = weight.dims();
@@ -74,7 +90,10 @@ pub fn load_embedding<B: Backend>(path: &str, device: &B::Device) -> Result<nn::
Ok(embedding) Ok(embedding)
} }
pub fn load_layer_norm<B: Backend>(path: &str, device: &B::Device) -> Result<nn::LayerNorm<B>, Box<dyn Error>> { pub fn load_layer_norm<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<nn::LayerNorm<B>, Box<dyn Error>> {
let weight = load_tensor::<B, 1>("weight", path, device)?; let weight = load_tensor::<B, 1>("weight", path, device)?;
let bias = load_tensor::<B, 1>("bias", path, device)?; let bias = load_tensor::<B, 1>("bias", path, device)?;
let eps = load_f32::<B>("eps", path, device)? as f64; let eps = load_f32::<B>("eps", path, device)? as f64;
@@ -84,7 +103,7 @@ pub fn load_layer_norm<B: Backend>(path: &str, device: &B::Device) -> Result<nn:
let record = nn::LayerNormRecord { let record = nn::LayerNormRecord {
gamma: weight.into(), gamma: weight.into(),
beta: bias.into(), beta: bias.into(),
epsilon: <f64 as Module<B>>::into_record(eps), epsilon: <f64 as Module<B>>::into_record(eps),
}; };
let layer_norm: nn::LayerNorm<B> = nn::LayerNormConfig::new(n_state).init_with(record); let layer_norm: nn::LayerNorm<B> = nn::LayerNormConfig::new(n_state).init_with(record);
@@ -92,20 +111,22 @@ pub fn load_layer_norm<B: Backend>(path: &str, device: &B::Device) -> Result<nn:
Ok(layer_norm) Ok(layer_norm)
} }
/*pub fn load_rmsnorm<B: Backend>(path: &str, device: &B::Device) -> Result<RMSNorm<B>, Box<dyn Error>> { /*pub fn load_rmsnorm<B: Backend>(path: &str, device: &B::Device) -> Result<RMSNorm<B>, Box<dyn Error>> {
let weight = load_tensor::<B, 1>("weight", path, device)?; let weight = load_tensor::<B, 1>("weight", path, device)?;
let eps = load_f32::<B>("eps", path, device)?.into(); let eps = load_f32::<B>("eps", path, device)?.into();
let rmsnorm = RMSNorm { let rmsnorm = RMSNorm {
weight: weight.into(), weight: weight.into(),
eps: eps eps: eps
}; };
Ok(rmsnorm) Ok(rmsnorm)
}*/ }*/
pub fn load_conv2d<B: Backend>(path: &str, device: &B::Device) -> Result<conv::Conv2d<B>, Box<dyn Error>> { pub fn load_conv2d<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<conv::Conv2d<B>, Box<dyn Error>> {
let weight = load_tensor::<B, 4>("weight", path, device)?; let weight = load_tensor::<B, 4>("weight", path, device)?;
let bias = load_tensor::<B, 1>("bias", path, device).ok(); let bias = load_tensor::<B, 1>("bias", path, device).ok();
let has_bias = bias.is_some(); let has_bias = bias.is_some();
@@ -127,24 +148,24 @@ pub fn load_conv2d<B: Backend>(path: &str, device: &B::Device) -> Result<conv::C
let padding = tensor_to_array_2(padding); let padding = tensor_to_array_2(padding);
let padding = nn::PaddingConfig2d::Explicit(padding[0], padding[1]); let padding = nn::PaddingConfig2d::Explicit(padding[0], padding[1]);
let record = conv::Conv2dRecord { let record = conv::Conv2dRecord {
weight: weight.into(), weight: weight.into(),
bias: bias.map(|t| t.into()), bias: bias.map(|t| t.into()),
stride: <[usize; 2] as Module<B>>::into_record(stride), stride: <[usize; 2] as Module<B>>::into_record(stride),
kernel_size: <[usize; 2] as Module<B>>::into_record(kernel_size), kernel_size: <[usize; 2] as Module<B>>::into_record(kernel_size),
dilation: <[usize; 2] as Module<B>>::into_record(dilation), dilation: <[usize; 2] as Module<B>>::into_record(dilation),
groups: <usize as Module<B>>::into_record(n_group), groups: <usize as Module<B>>::into_record(n_group),
padding: <nn::PaddingConfig2d as Module<B>>::into_record(padding.clone()), padding: <nn::PaddingConfig2d as Module<B>>::into_record(padding.clone()),
}; };
let conv2d: conv::Conv2d<B> = conv::Conv2dConfig::new([n_channels_in, n_channels_out], kernel_size) let conv2d: conv::Conv2d<B> =
.with_stride(stride) conv::Conv2dConfig::new([n_channels_in, n_channels_out], kernel_size)
.with_dilation(dilation) .with_stride(stride)
.with_groups(n_group) .with_dilation(dilation)
.with_padding(padding) .with_groups(n_group)
.with_bias(has_bias) .with_padding(padding)
.init_with(record); .with_bias(has_bias)
.init_with(record);
Ok(conv2d) Ok(conv2d)
} }
@@ -164,4 +185,4 @@ pub fn tensor_to_array<const N: usize, B: Backend>(x: Tensor<B, 1>) -> [usize; N
} }
arr arr
} }

View File

@@ -1,11 +1,11 @@
pub mod stablediffusion; pub mod stablediffusion;
pub mod autoencoder; pub mod autoencoder;
pub mod unet;
pub mod clip; pub mod clip;
pub mod unet;
pub mod silu;
pub mod groupnorm;
pub mod attention; pub mod attention;
pub mod groupnorm;
pub mod silu;
pub mod load; pub mod load;

View File

@@ -1,13 +1,8 @@
use burn::{ use burn::{
module::Module, module::Module,
tensor::{ tensor::{activation::sigmoid, backend::Backend, Tensor},
backend::Backend,
activation::sigmoid,
Tensor,
},
}; };
#[derive(Module, Clone, Debug)] #[derive(Module, Clone, Debug)]
pub struct SILU {} pub struct SILU {}
@@ -19,4 +14,4 @@ impl SILU {
pub fn forward<B: Backend, const D: usize>(&self, x: Tensor<B, D>) -> Tensor<B, D> { pub fn forward<B: Backend, const D: usize>(&self, x: Tensor<B, D>) -> Tensor<B, D> {
x.clone() * sigmoid(x) x.clone() * sigmoid(x)
} }
} }

View File

@@ -1,20 +1,22 @@
use std::error::Error;
use burn::tensor::ElementConversion; use burn::tensor::ElementConversion;
use std::error::Error;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn, nn,
tensor::{ tensor::{backend::Backend, Tensor},
backend::Backend,
Tensor,
},
}; };
use super::*; use super::*;
use crate::model::{load::*, autoencoder::load::load_autoencoder, unet::load::load_unet, clip::load::load_clip}; use crate::model::{
autoencoder::load::load_autoencoder, clip::load::load_clip, load::*, unet::load::load_unet,
};
pub fn load_stable_diffusion<B: Backend>(path: &str, device: &B::Device) -> Result<StableDiffusion<B>, Box<dyn Error>> { pub fn load_stable_diffusion<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<StableDiffusion<B>, Box<dyn Error>> {
let n_steps = load_usize::<B>("n_steps", path, device)?; let n_steps = load_usize::<B>("n_steps", path, device)?;
let alpha_cumulative_products = load_tensor::<B, 1>("alphas_cumprod", path, device)?.into(); let alpha_cumulative_products = load_tensor::<B, 1>("alphas_cumprod", path, device)?.into();
let autoencoder = load_autoencoder(&format!("{}/{}", path, "autoencoder"), device)?; let autoencoder = load_autoencoder(&format!("{}/{}", path, "autoencoder"), device)?;
@@ -22,11 +24,10 @@ pub fn load_stable_diffusion<B: Backend>(path: &str, device: &B::Device) -> Resu
let clip = load_clip(&format!("{}/{}", path, "clip"), device)?; let clip = load_clip(&format!("{}/{}", path, "clip"), device)?;
Ok(StableDiffusion { Ok(StableDiffusion {
n_steps, n_steps,
alpha_cumulative_products, alpha_cumulative_products,
autoencoder, autoencoder,
diffusion, diffusion,
clip, clip,
}) })
} }

View File

@@ -1,30 +1,20 @@
pub mod load; pub mod load;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
tensor::{ tensor::{backend::Backend, BasicOps, Data, Distribution, Float, Int, Tensor},
backend::Backend,
Tensor,
Int,
Float,
BasicOps,
Data,
Distribution,
},
}; };
use num_traits::ToPrimitive; use num_traits::ToPrimitive;
use super::autoencoder::{Autoencoder, AutoencoderConfig}; use super::autoencoder::{Autoencoder, AutoencoderConfig};
use super::clip::{CLIPConfig, CLIP};
use super::unet::{UNet, UNetConfig}; use super::unet::{UNet, UNetConfig};
use super::clip::{CLIP, CLIPConfig};
use crate::tokenizer::SimpleTokenizer; use crate::tokenizer::SimpleTokenizer;
#[derive(Config)] #[derive(Config)]
pub struct StableDiffusionConfig { pub struct StableDiffusionConfig {}
}
impl StableDiffusionConfig { impl StableDiffusionConfig {
pub fn init<B: Backend>(&self) -> StableDiffusion<B> { pub fn init<B: Backend>(&self) -> StableDiffusion<B> {
@@ -36,29 +26,40 @@ impl StableDiffusionConfig {
let clip = CLIPConfig::new(49408, 768, 12, 77, 12).init(); let clip = CLIPConfig::new(49408, 768, 12, 77, 12).init();
StableDiffusion { StableDiffusion {
n_steps, n_steps,
alpha_cumulative_products, alpha_cumulative_products,
autoencoder, autoencoder,
diffusion, diffusion,
clip, clip,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct StableDiffusion<B: Backend> { pub struct StableDiffusion<B: Backend> {
n_steps: usize, n_steps: usize,
alpha_cumulative_products: Param<Tensor<B, 1>>, alpha_cumulative_products: Param<Tensor<B, 1>>,
autoencoder: Autoencoder<B>, autoencoder: Autoencoder<B>,
diffusion: UNet<B>, diffusion: UNet<B>,
clip: CLIP<B>, clip: CLIP<B>,
} }
impl<B: Backend> StableDiffusion<B> { impl<B: Backend> StableDiffusion<B> {
pub fn sample_image(&self, context: Tensor<B, 3>, unconditional_context: Tensor<B, 2>, unconditional_guidance_scale: f64, n_steps: usize) -> Vec<Vec<u8>> { pub fn sample_image(
&self,
context: Tensor<B, 3>,
unconditional_context: Tensor<B, 2>,
unconditional_guidance_scale: f64,
n_steps: usize,
) -> Vec<Vec<u8>> {
let [n_batch, _, _] = context.dims(); let [n_batch, _, _] = context.dims();
let latent = self.sample_latent(context, unconditional_context, unconditional_guidance_scale, n_steps); let latent = self.sample_latent(
context,
unconditional_context,
unconditional_guidance_scale,
n_steps,
);
self.latent_to_image(latent) self.latent_to_image(latent)
} }
@@ -71,7 +72,7 @@ impl<B: Backend> StableDiffusion<B> {
let width = 512; let width = 512;
let num_elements_per_image = n_channel * height * width; let num_elements_per_image = n_channel * height * width;
// correct size and scale and reorder to // correct size and scale and reorder to
let image = (image + 1.0) / 2.0; let image = (image + 1.0) / 2.0;
let image = image let image = image
.reshape([n_batch, n_channel, height, width]) .reshape([n_batch, n_channel, height, width])
@@ -79,19 +80,29 @@ impl<B: Backend> StableDiffusion<B> {
.swap_dims(2, 3) .swap_dims(2, 3)
.mul_scalar(255.0); .mul_scalar(255.0);
let flattened: Vec<_> = image. let flattened: Vec<_> = image.into_data().value;
into_data().
value;
(0..n_batch).into_iter().map(|b| { (0..n_batch)
let start = b * num_elements_per_image; .into_iter()
let end = start + num_elements_per_image; .map(|b| {
let start = b * num_elements_per_image;
let end = start + num_elements_per_image;
flattened[start..end].into_iter().map(|v| v.to_f64().unwrap().min(255.0).max(0.0).to_u8().unwrap()).collect() flattened[start..end]
}).collect() .into_iter()
.map(|v| v.to_f64().unwrap().min(255.0).max(0.0).to_u8().unwrap())
.collect()
})
.collect()
} }
pub fn sample_latent(&self, context: Tensor<B, 3>, unconditional_context: Tensor<B, 2>, unconditional_guidance_scale: f64, n_steps: usize) -> Tensor<B, 4> { pub fn sample_latent(
&self,
context: Tensor<B, 3>,
unconditional_context: Tensor<B, 2>,
unconditional_guidance_scale: f64,
n_steps: usize,
) -> Tensor<B, 4> {
let device = context.device(); let device = context.device();
let step_size = self.n_steps / n_steps; let step_size = self.n_steps / n_steps;
@@ -99,7 +110,8 @@ impl<B: Backend> StableDiffusion<B> {
let [n_batches, _, _] = context.dims(); let [n_batches, _, _] = context.dims();
let gen_noise = || { let gen_noise = || {
Tensor::random([n_batches, 4, 64, 64], Distribution::Normal(0.0, 1.0)).to_device(&device) Tensor::random([n_batches, 4, 64, 64], Distribution::Normal(0.0, 1.0))
.to_device(&device)
}; };
let sigma = 0.0; // Use deterministic diffusion let sigma = 0.0; // Use deterministic diffusion
@@ -107,10 +119,21 @@ impl<B: Backend> StableDiffusion<B> {
let mut latent = gen_noise(); let mut latent = gen_noise();
for t in (0..self.n_steps).rev().step_by(step_size) { for t in (0..self.n_steps).rev().step_by(step_size) {
let current_alpha: f64 = self.alpha_cumulative_products.val().slice([t..t + 1]).into_scalar().to_f64().unwrap(); let current_alpha: f64 = self
.alpha_cumulative_products
.val()
.slice([t..t + 1])
.into_scalar()
.to_f64()
.unwrap();
let prev_alpha: f64 = if t >= step_size { let prev_alpha: f64 = if t >= step_size {
let i = t - step_size; let i = t - step_size;
self.alpha_cumulative_products.val().slice([i..i + 1]).into_scalar().to_f64().unwrap() self.alpha_cumulative_products
.val()
.slice([i..i + 1])
.into_scalar()
.to_f64()
.unwrap()
} else { } else {
1.0 1.0
}; };
@@ -118,7 +141,13 @@ impl<B: Backend> StableDiffusion<B> {
let sqrt_noise = (1.0 - current_alpha).sqrt(); let sqrt_noise = (1.0 - current_alpha).sqrt();
let timestep = Tensor::from_ints([t as i32]).to_device(&device); let timestep = Tensor::from_ints([t as i32]).to_device(&device);
let pred_noise = self.forward_diffuser(latent.clone(), timestep, context.clone(), unconditional_context.clone(), unconditional_guidance_scale); let pred_noise = self.forward_diffuser(
latent.clone(),
timestep,
context.clone(),
unconditional_context.clone(),
unconditional_guidance_scale,
);
let predx0 = (latent - pred_noise.clone() * sqrt_noise) / current_alpha.sqrt(); let predx0 = (latent - pred_noise.clone() * sqrt_noise) / current_alpha.sqrt();
let dir_latent = pred_noise * (1.0 - prev_alpha - sigma * sigma).sqrt(); let dir_latent = pred_noise * (1.0 - prev_alpha - sigma * sigma).sqrt();
@@ -129,32 +158,36 @@ impl<B: Backend> StableDiffusion<B> {
latent latent
} }
fn forward_diffuser(&self, latent: Tensor<B, 4>, timestep: Tensor<B, 1, Int>, context: Tensor<B, 3>, unconditional_context: Tensor<B, 2>, unconditional_guidance_scale: f64) -> Tensor<B, 4> { fn forward_diffuser(
&self,
latent: Tensor<B, 4>,
timestep: Tensor<B, 1, Int>,
context: Tensor<B, 3>,
unconditional_context: Tensor<B, 2>,
unconditional_guidance_scale: f64,
) -> Tensor<B, 4> {
let [n_batch, _, _, _] = latent.dims(); let [n_batch, _, _, _] = latent.dims();
//let latent = latent.repeat(0, 2); //let latent = latent.repeat(0, 2);
let unconditional_latent = self.diffusion.forward( let unconditional_latent = self.diffusion.forward(
latent.clone(), latent.clone(),
timestep.clone(), timestep.clone(),
unconditional_context.unsqueeze().repeat(0, n_batch) unconditional_context.unsqueeze().repeat(0, n_batch),
); );
let conditional_latent = self.diffusion.forward( let conditional_latent = self.diffusion.forward(latent, timestep, context);
latent,
timestep,
context
);
/*let latent = self.diffusion.forward( /*let latent = self.diffusion.forward(
latent.repeat(0, 2), latent.repeat(0, 2),
timestep.repeat(0, 2), timestep.repeat(0, 2),
Tensor::cat(vec![unconditional_context.unsqueeze::<3>(), context], 0) Tensor::cat(vec![unconditional_context.unsqueeze::<3>(), context], 0)
); );
let unconditional_latent = latent.clone().slice([0..n_batch]); let unconditional_latent = latent.clone().slice([0..n_batch]);
let conditional_latent = latent.slice([n_batch..2 * n_batch]);*/ let conditional_latent = latent.slice([n_batch..2 * n_batch]);*/
unconditional_latent.clone() + (conditional_latent - unconditional_latent) * unconditional_guidance_scale unconditional_latent.clone()
+ (conditional_latent - unconditional_latent) * unconditional_guidance_scale
} }
pub fn unconditional_context(&self, tokenizer: &SimpleTokenizer) -> Tensor<B, 2> { pub fn unconditional_context(&self, tokenizer: &SimpleTokenizer) -> Tensor<B, 2> {
@@ -164,17 +197,25 @@ impl<B: Backend> StableDiffusion<B> {
pub fn context(&self, tokenizer: &SimpleTokenizer, text: &str) -> Tensor<B, 3> { pub fn context(&self, tokenizer: &SimpleTokenizer, text: &str) -> Tensor<B, 3> {
let device = &self.clip.devices()[0]; let device = &self.clip.devices()[0];
let text = format!("<|startoftext|>{}<|endoftext|>", text); let text = format!("<|startoftext|>{}<|endoftext|>", text);
let tokenized: Vec<_> = tokenizer.encode(&text).into_iter().map(|v| v as i32).collect(); let tokenized: Vec<_> = tokenizer
.encode(&text)
.into_iter()
.map(|v| v as i32)
.collect();
self.clip.forward(Tensor::from_ints(&tokenized[..]).to_device(device).unsqueeze()) self.clip.forward(
Tensor::from_ints(&tokenized[..])
.to_device(device)
.unsqueeze(),
)
} }
} }
use crate::helper::to_float;
use std::f64::consts::PI; use std::f64::consts::PI;
fn cosine_schedule<B: Backend>(n_steps: usize) -> Tensor<B, 1> { fn cosine_schedule<B: Backend>(n_steps: usize) -> Tensor<B, 1> {
to_float(Tensor::arange(1..n_steps + 1)) Tensor::arange(1..n_steps + 1)
.float()
.mul_scalar(PI * 0.5 / n_steps as f64) .mul_scalar(PI * 0.5 / n_steps as f64)
.cos() .cos()
} }
@@ -185,12 +226,12 @@ fn offset_cosine_schedule<B: Backend>(n_steps: usize) -> Tensor<B, 1> {
let start_angle = max_signal_rate.acos(); let start_angle = max_signal_rate.acos();
let end_angle = min_signal_rate.acos(); let end_angle = min_signal_rate.acos();
let times = Tensor::arange(1..n_steps + 1); let times = Tensor::arange(1..n_steps + 1).float();
let diffusion_angles = to_float(times) * ( (end_angle - start_angle) / n_steps as f64) + start_angle; let diffusion_angles = times * ((end_angle - start_angle) / n_steps as f64) + start_angle;
diffusion_angles.cos() diffusion_angles.cos()
} }
fn offset_cosine_schedule_cumprod<B: Backend>(n_steps: usize) -> Tensor<B, 1> { fn offset_cosine_schedule_cumprod<B: Backend>(n_steps: usize) -> Tensor<B, 1> {
offset_cosine_schedule::<B>(n_steps).powf(2.0) offset_cosine_schedule::<B>(n_steps).powf(2.0)
} }

View File

@@ -4,19 +4,19 @@ use crate::model::load::*;
use std::error::Error; use std::error::Error;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn, nn,
tensor::{ tensor::{backend::Backend, Tensor},
backend::Backend,
Tensor,
},
}; };
use super::*; use super::*;
use crate::model::groupnorm::load::load_group_norm; use crate::model::groupnorm::load::load_group_norm;
pub fn load_res_block<B: Backend>(path: &str, device: &B::Device) -> Result<ResBlock<B>, Box<dyn Error>> { pub fn load_res_block<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<ResBlock<B>, Box<dyn Error>> {
let norm_in = load_group_norm::<B>(&format!("{}/{}", path, "norm_in"), device)?; let norm_in = load_group_norm::<B>(&format!("{}/{}", path, "norm_in"), device)?;
let conv_in = load_conv2d::<B>(&format!("{}/{}", path, "conv_in"), device)?; let conv_in = load_conv2d::<B>(&format!("{}/{}", path, "conv_in"), device)?;
let lin_embed = load_linear::<B>(&format!("{}/{}", path, "lin_embed"), device)?; let lin_embed = load_linear::<B>(&format!("{}/{}", path, "lin_embed"), device)?;
@@ -26,12 +26,12 @@ pub fn load_res_block<B: Backend>(path: &str, device: &B::Device) -> Result<ResB
let res_block = ResBlock { let res_block = ResBlock {
norm_in: norm_in, norm_in: norm_in,
silu_in: SILU::new(), silu_in: SILU::new(),
conv_in: conv_in, conv_in: conv_in,
silu_embed: SILU::new(), silu_embed: SILU::new(),
lin_embed: lin_embed, lin_embed: lin_embed,
norm_out: norm_out, norm_out: norm_out,
silu_out: SILU::new(), silu_out: SILU::new(),
conv_out: conv_out, conv_out: conv_out,
skip_connection: skip_connection, skip_connection: skip_connection,
}; };
@@ -39,7 +39,10 @@ pub fn load_res_block<B: Backend>(path: &str, device: &B::Device) -> Result<ResB
Ok(res_block) Ok(res_block)
} }
pub fn load_multi_head_attention<B: Backend>(path: &str, device: &B::Device) -> Result<MultiHeadAttention<B>, Box<dyn Error>> { pub fn load_multi_head_attention<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<MultiHeadAttention<B>, Box<dyn Error>> {
let n_head = load_usize::<B>("n_head", path, device)?; let n_head = load_usize::<B>("n_head", path, device)?;
let query = load_linear::<B>(&format!("{}/{}", path, "query"), device)?; let query = load_linear::<B>(&format!("{}/{}", path, "query"), device)?;
let key = load_linear::<B>(&format!("{}/{}", path, "key"), device)?; let key = load_linear::<B>(&format!("{}/{}", path, "key"), device)?;
@@ -53,11 +56,10 @@ pub fn load_multi_head_attention<B: Backend>(path: &str, device: &B::Device) ->
value: value, value: value,
out: out, out: out,
}; };
Ok(multi_head_attention) Ok(multi_head_attention)
} }
pub fn load_geglu<B: Backend>(path: &str, device: &B::Device) -> Result<GEGLU<B>, Box<dyn Error>> { pub fn load_geglu<B: Backend>(path: &str, device: &B::Device) -> Result<GEGLU<B>, Box<dyn Error>> {
let proj = load_linear::<B>(&format!("{}/{}", path, "proj"), device)?; let proj = load_linear::<B>(&format!("{}/{}", path, "proj"), device)?;
@@ -65,11 +67,10 @@ pub fn load_geglu<B: Backend>(path: &str, device: &B::Device) -> Result<GEGLU<B>
proj: proj, proj: proj,
gelu: GELU::new(), // Assuming GELU::new() initializes a new GELU struct gelu: GELU::new(), // Assuming GELU::new() initializes a new GELU struct
}; };
Ok(geglue) Ok(geglue)
} }
pub fn load_mlp<B: Backend>(path: &str, device: &B::Device) -> Result<MLP<B>, Box<dyn Error>> { pub fn load_mlp<B: Backend>(path: &str, device: &B::Device) -> Result<MLP<B>, Box<dyn Error>> {
let geglu = load_geglu::<B>(&format!("{}/{}", path, "geglu"), device)?; let geglu = load_geglu::<B>(&format!("{}/{}", path, "geglu"), device)?;
let lin = load_linear::<B>(&format!("{}/{}", path, "lin"), device)?; let lin = load_linear::<B>(&format!("{}/{}", path, "lin"), device)?;
@@ -78,12 +79,14 @@ pub fn load_mlp<B: Backend>(path: &str, device: &B::Device) -> Result<MLP<B>, Bo
geglu: geglu, geglu: geglu,
lin: lin, lin: lin,
}; };
Ok(mlp) Ok(mlp)
} }
pub fn load_transformer_block<B: Backend>(
pub fn load_transformer_block<B: Backend>(path: &str, device: &B::Device) -> Result<TransformerBlock<B>, Box<dyn Error>> { path: &str,
device: &B::Device,
) -> Result<TransformerBlock<B>, Box<dyn Error>> {
let norm1 = load_layer_norm::<B>(&format!("{}/{}", path, "norm1"), device)?; let norm1 = load_layer_norm::<B>(&format!("{}/{}", path, "norm1"), device)?;
let attn1 = load_multi_head_attention::<B>(&format!("{}/{}", path, "attn1"), device)?; let attn1 = load_multi_head_attention::<B>(&format!("{}/{}", path, "attn1"), device)?;
let norm2 = load_layer_norm::<B>(&format!("{}/{}", path, "norm2"), device)?; let norm2 = load_layer_norm::<B>(&format!("{}/{}", path, "norm2"), device)?;
@@ -99,12 +102,14 @@ pub fn load_transformer_block<B: Backend>(path: &str, device: &B::Device) -> Res
norm3: norm3, norm3: norm3,
mlp: mlp, mlp: mlp,
}; };
Ok(transformer_block) Ok(transformer_block)
} }
pub fn load_spatial_transformer<B: Backend>(
pub fn load_spatial_transformer<B: Backend>(path: &str, device: &B::Device) -> Result<SpatialTransformer<B>, Box<dyn Error>> { path: &str,
device: &B::Device,
) -> Result<SpatialTransformer<B>, Box<dyn Error>> {
let norm = load_group_norm::<B>(&format!("{}/{}", path, "norm"), device)?; let norm = load_group_norm::<B>(&format!("{}/{}", path, "norm"), device)?;
let proj_in = load_conv2d::<B>(&format!("{}/{}", path, "proj_in"), device)?; let proj_in = load_conv2d::<B>(&format!("{}/{}", path, "proj_in"), device)?;
let transformer = load_transformer_block::<B>(&format!("{}/{}", path, "transformer"), device)?; let transformer = load_transformer_block::<B>(&format!("{}/{}", path, "transformer"), device)?;
@@ -116,28 +121,35 @@ pub fn load_spatial_transformer<B: Backend>(path: &str, device: &B::Device) -> R
transformer: transformer, transformer: transformer,
proj_out: proj_out, proj_out: proj_out,
}; };
Ok(spatial_transformer) Ok(spatial_transformer)
} }
pub fn load_upsample<B: Backend>(
pub fn load_upsample<B: Backend>(path: &str, device: &B::Device) -> Result<Upsample<B>, Box<dyn Error>> { path: &str,
device: &B::Device,
) -> Result<Upsample<B>, Box<dyn Error>> {
let conv = load_conv2d::<B>(&format!("{}/{}", path, "conv"), device)?; let conv = load_conv2d::<B>(&format!("{}/{}", path, "conv"), device)?;
let upsample = Upsample { let upsample = Upsample { conv: conv };
conv: conv,
};
Ok(upsample) Ok(upsample)
} }
pub fn load_downsample<B: Backend>(path: &str, device: &B::Device) -> Result<Downsample<B>, Box<dyn Error>> { pub fn load_downsample<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<Downsample<B>, Box<dyn Error>> {
load_conv2d(path, device) load_conv2d(path, device)
} }
pub fn load_res_transformer_res<B: Backend>(path: &str, device: &B::Device) -> Result<ResTransformerRes<B>, Box<dyn Error>> { pub fn load_res_transformer_res<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<ResTransformerRes<B>, Box<dyn Error>> {
let res1 = load_res_block::<B>(&format!("{}/{}", path, "res1"), device)?; // Assuming load_res_block function let res1 = load_res_block::<B>(&format!("{}/{}", path, "res1"), device)?; // Assuming load_res_block function
let transformer = load_spatial_transformer::<B>(&format!("{}/{}", path, "transformer"), device)?; let transformer =
load_spatial_transformer::<B>(&format!("{}/{}", path, "transformer"), device)?;
let res2 = load_res_block::<B>(&format!("{}/{}", path, "res2"), device)?; let res2 = load_res_block::<B>(&format!("{}/{}", path, "res2"), device)?;
let res_transformer_res = ResTransformerRes { let res_transformer_res = ResTransformerRes {
@@ -145,13 +157,17 @@ pub fn load_res_transformer_res<B: Backend>(path: &str, device: &B::Device) -> R
transformer: transformer, transformer: transformer,
res2: res2, res2: res2,
}; };
Ok(res_transformer_res) Ok(res_transformer_res)
} }
pub fn load_res_transformer_upsample<B: Backend>(path: &str, device: &B::Device) -> Result<ResTransformerUpsample<B>, Box<dyn Error>> { pub fn load_res_transformer_upsample<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<ResTransformerUpsample<B>, Box<dyn Error>> {
let res = load_res_block::<B>(&format!("{}/{}", path, "res"), device)?; let res = load_res_block::<B>(&format!("{}/{}", path, "res"), device)?;
let transformer = load_spatial_transformer::<B>(&format!("{}/{}", path, "transformer"), device)?; let transformer =
load_spatial_transformer::<B>(&format!("{}/{}", path, "transformer"), device)?;
let upsample = load_upsample::<B>(&format!("{}/{}", path, "upsample"), device)?; let upsample = load_upsample::<B>(&format!("{}/{}", path, "upsample"), device)?;
let res_transformer_upsample = ResTransformerUpsample { let res_transformer_upsample = ResTransformerUpsample {
@@ -159,12 +175,14 @@ pub fn load_res_transformer_upsample<B: Backend>(path: &str, device: &B::Device)
transformer: transformer, transformer: transformer,
upsample: upsample, upsample: upsample,
}; };
Ok(res_transformer_upsample) Ok(res_transformer_upsample)
} }
pub fn load_res_upsample<B: Backend>(
pub fn load_res_upsample<B: Backend>(path: &str, device: &B::Device) -> Result<ResUpSample<B>, Box<dyn Error>> { path: &str,
device: &B::Device,
) -> Result<ResUpSample<B>, Box<dyn Error>> {
let res = load_res_block::<B>(&format!("{}/{}", path, "res"), device)?; let res = load_res_block::<B>(&format!("{}/{}", path, "res"), device)?;
let upsample = load_upsample::<B>(&format!("{}/{}", path, "upsample"), device)?; let upsample = load_upsample::<B>(&format!("{}/{}", path, "upsample"), device)?;
@@ -172,25 +190,30 @@ pub fn load_res_upsample<B: Backend>(path: &str, device: &B::Device) -> Result<R
res: res, res: res,
upsample: upsample, upsample: upsample,
}; };
Ok(res_upsample) Ok(res_upsample)
} }
pub fn load_res_transformer<B: Backend>(
pub fn load_res_transformer<B: Backend>(path: &str, device: &B::Device) -> Result<ResTransformer<B>, Box<dyn Error>> { path: &str,
device: &B::Device,
) -> Result<ResTransformer<B>, Box<dyn Error>> {
let res = load_res_block::<B>(&format!("{}/{}", path, "res"), device)?; let res = load_res_block::<B>(&format!("{}/{}", path, "res"), device)?;
let transformer = load_spatial_transformer::<B>(&format!("{}/{}", path, "transformer"), device)?; let transformer =
load_spatial_transformer::<B>(&format!("{}/{}", path, "transformer"), device)?;
let res_transformer = ResTransformer { let res_transformer = ResTransformer {
res: res, res: res,
transformer: transformer, transformer: transformer,
}; };
Ok(res_transformer) Ok(res_transformer)
} }
pub fn load_unet_input_blocks<B: Backend>(
pub fn load_unet_input_blocks<B: Backend>(path: &str, device: &B::Device) -> Result<UNetInputBlocks<B>, Box<dyn Error>> { path: &str,
device: &B::Device,
) -> Result<UNetInputBlocks<B>, Box<dyn Error>> {
let conv = load_conv2d::<B>(&format!("{}/{}", path, "conv"), device)?; let conv = load_conv2d::<B>(&format!("{}/{}", path, "conv"), device)?;
let rt1 = load_res_transformer::<B>(&format!("{}/{}", path, "rt1"), device)?; let rt1 = load_res_transformer::<B>(&format!("{}/{}", path, "rt1"), device)?;
let rt2 = load_res_transformer::<B>(&format!("{}/{}", path, "rt2"), device)?; let rt2 = load_res_transformer::<B>(&format!("{}/{}", path, "rt2"), device)?;
@@ -218,11 +241,14 @@ pub fn load_unet_input_blocks<B: Backend>(path: &str, device: &B::Device) -> Res
r1: r1, r1: r1,
r2: r2, r2: r2,
}; };
Ok(unet_input_blocks) Ok(unet_input_blocks)
} }
pub fn load_unet_output_blocks<B: Backend>(path: &str, device: &B::Device) -> Result<UNetOutputBlocks<B>, Box<dyn Error>> { pub fn load_unet_output_blocks<B: Backend>(
path: &str,
device: &B::Device,
) -> Result<UNetOutputBlocks<B>, Box<dyn Error>> {
let r1 = load_res_block::<B>(&format!("{}/{}", path, "r1"), device)?; let r1 = load_res_block::<B>(&format!("{}/{}", path, "r1"), device)?;
let r2 = load_res_block::<B>(&format!("{}/{}", path, "r2"), device)?; let r2 = load_res_block::<B>(&format!("{}/{}", path, "r2"), device)?;
let ru = load_res_upsample::<B>(&format!("{}/{}", path, "ru"), device)?; let ru = load_res_upsample::<B>(&format!("{}/{}", path, "ru"), device)?;
@@ -252,14 +278,16 @@ pub fn load_unet_output_blocks<B: Backend>(path: &str, device: &B::Device) -> Re
}) })
} }
pub fn load_unet<B: Backend>(path: &str, device: &B::Device) -> Result<UNet<B>, Box<dyn Error>> { pub fn load_unet<B: Backend>(path: &str, device: &B::Device) -> Result<UNet<B>, Box<dyn Error>> {
let lin1_time_embed = load_linear::<B>(&format!("{}/{}", path, "lin1_time_embed"), device)?; let lin1_time_embed = load_linear::<B>(&format!("{}/{}", path, "lin1_time_embed"), device)?;
let silu_time_embed = SILU::new(); // Assuming SILU::new() initializes a new SILU struct let silu_time_embed = SILU::new(); // Assuming SILU::new() initializes a new SILU struct
let lin2_time_embed = load_linear::<B>(&format!("{}/{}", path, "lin2_time_embed"), device)?; let lin2_time_embed = load_linear::<B>(&format!("{}/{}", path, "lin2_time_embed"), device)?;
let input_blocks = load_unet_input_blocks::<B>(&format!("{}/{}", path, "input_blocks"), device)?; let input_blocks =
let middle_block = load_res_transformer_res::<B>(&format!("{}/{}", path, "middle_block"), device)?; load_unet_input_blocks::<B>(&format!("{}/{}", path, "input_blocks"), device)?;
let output_blocks = load_unet_output_blocks::<B>(&format!("{}/{}", path, "output_blocks"), device)?; let middle_block =
load_res_transformer_res::<B>(&format!("{}/{}", path, "middle_block"), device)?;
let output_blocks =
load_unet_output_blocks::<B>(&format!("{}/{}", path, "output_blocks"), device)?;
let norm_out = load_group_norm::<B>(&format!("{}/{}", path, "norm_out"), device)?; let norm_out = load_group_norm::<B>(&format!("{}/{}", path, "norm_out"), device)?;
let silu_out = SILU::new(); // Assuming SILU::new() initializes a new SILU struct let silu_out = SILU::new(); // Assuming SILU::new() initializes a new SILU struct
let conv_out = load_conv2d::<B>(&format!("{}/{}", path, "conv_out"), device)?; let conv_out = load_conv2d::<B>(&format!("{}/{}", path, "conv_out"), device)?;

View File

@@ -1,34 +1,34 @@
pub mod load; pub mod load;
use burn::{ use burn::{
config::Config, config::Config,
module::{Module, Param}, module::{Module, Param},
nn::{self, PaddingConfig2d, GELU, conv::{Conv2d, Conv2dConfig}}, nn::{
tensor::{ self,
backend::Backend, conv::{Conv2d, Conv2dConfig},
activation::softmax, PaddingConfig2d, GELU,
module::embedding,
Tensor,
Distribution,
Int,
}, },
tensor::{activation::softmax, backend::Backend, module::embedding, Distribution, Int, Tensor},
}; };
use super::silu::*;
use super::groupnorm::*; use super::groupnorm::*;
use crate::helper::to_float; use super::silu::*;
use super::attention::qkv_attention; use super::attention::qkv_attention;
fn timestep_embedding<B: Backend>(
fn timestep_embedding<B: Backend>(timesteps: Tensor<B, 1, Int>, dim: usize, max_period: usize) -> Tensor<B, 2> { timesteps: Tensor<B, 1, Int>,
dim: usize,
max_period: usize,
) -> Tensor<B, 2> {
let half = dim / 2; let half = dim / 2;
let freqs = ( to_float(Tensor::arange_device(0..half, &timesteps.device())) * (-(max_period as f64).ln() / half as f64 ) ).exp(); let freqs = (Tensor::arange_device(0..half, &timesteps.device()).float()
let args = to_float(timesteps) * freqs; * (-(max_period as f64).ln() / half as f64))
.exp();
let args = timesteps.float() * freqs;
Tensor::cat(vec![args.clone().cos(), args.sin()], 0).unsqueeze() Tensor::cat(vec![args.clone().cos(), args.sin()], 0).unsqueeze()
} }
#[derive(Config)] #[derive(Config)]
pub struct UNetConfig {} pub struct UNetConfig {}
@@ -39,7 +39,9 @@ impl UNetConfig {
let lin2_time_embed = nn::LinearConfig::new(1280, 1280).init(); let lin2_time_embed = nn::LinearConfig::new(1280, 1280).init();
let input_blocks = UNetInputBlocks { let input_blocks = UNetInputBlocks {
conv: Conv2dConfig::new([4, 320], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init(), conv: Conv2dConfig::new([4, 320], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init(),
rt1: ResTransformerConfig::new(320, 1280, 320, 768, 8).init(), rt1: ResTransformerConfig::new(320, 1280, 320, 768, 8).init(),
rt2: ResTransformerConfig::new(320, 1280, 320, 768, 8).init(), rt2: ResTransformerConfig::new(320, 1280, 320, 768, 8).init(),
d1: DownsampleConfig::new(320).init(), d1: DownsampleConfig::new(320).init(),
@@ -52,7 +54,7 @@ impl UNetConfig {
r1: ResBlockConfig::new(1280, 1280, 1280).init(), r1: ResBlockConfig::new(1280, 1280, 1280).init(),
r2: ResBlockConfig::new(1280, 1280, 1280).init(), r2: ResBlockConfig::new(1280, 1280, 1280).init(),
}; };
let middle_block = ResTransformerResConfig::new(1280, 1280, 1280, 768, 8).init(); let middle_block = ResTransformerResConfig::new(1280, 1280, 1280, 768, 8).init();
let output_blocks = UNetOutputBlocks { let output_blocks = UNetOutputBlocks {
@@ -72,37 +74,44 @@ impl UNetConfig {
let norm_out = GroupNormConfig::new(32, 320).init(); let norm_out = GroupNormConfig::new(32, 320).init();
let silu_out = SILU::new(); let silu_out = SILU::new();
let conv_out = Conv2dConfig::new([320, 4], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init(); let conv_out = Conv2dConfig::new([320, 4], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init();
UNet { UNet {
lin1_time_embed, lin1_time_embed,
silu_time_embed, silu_time_embed,
lin2_time_embed, lin2_time_embed,
input_blocks, input_blocks,
middle_block, middle_block,
output_blocks, output_blocks,
norm_out, norm_out,
silu_out, silu_out,
conv_out, conv_out,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct UNet<B: Backend> { pub struct UNet<B: Backend> {
lin1_time_embed: nn::Linear<B>, lin1_time_embed: nn::Linear<B>,
silu_time_embed: SILU, silu_time_embed: SILU,
lin2_time_embed: nn::Linear<B>, lin2_time_embed: nn::Linear<B>,
input_blocks: UNetInputBlocks<B>, input_blocks: UNetInputBlocks<B>,
middle_block: ResTransformerRes<B>, middle_block: ResTransformerRes<B>,
output_blocks: UNetOutputBlocks<B>, output_blocks: UNetOutputBlocks<B>,
norm_out: GroupNorm<B>, norm_out: GroupNorm<B>,
silu_out: SILU, silu_out: SILU,
conv_out: Conv2d<B>, conv_out: Conv2d<B>,
} }
impl<B: Backend> UNet<B> { impl<B: Backend> UNet<B> {
pub fn forward(&self, x: Tensor<B, 4>, timesteps: Tensor<B, 1, Int>, context: Tensor<B, 3>) -> Tensor<B, 4> { pub fn forward(
&self,
x: Tensor<B, 4>,
timesteps: Tensor<B, 1, Int>,
context: Tensor<B, 3>,
) -> Tensor<B, 4> {
let t_emb = timestep_embedding(timesteps, 320, 10000); let t_emb = timestep_embedding(timesteps, 320, 10000);
let emb = self.lin1_time_embed.forward(t_emb); let emb = self.lin1_time_embed.forward(t_emb);
let emb = self.silu_time_embed.forward(emb); let emb = self.silu_time_embed.forward(emb);
@@ -133,39 +142,27 @@ impl<B: Backend> UNet<B> {
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct UNetInputBlocks<B: Backend> { pub struct UNetInputBlocks<B: Backend> {
conv: Conv2d<B>, conv: Conv2d<B>,
rt1: ResTransformer<B>, rt1: ResTransformer<B>,
rt2: ResTransformer<B>, rt2: ResTransformer<B>,
d1: Downsample<B>, d1: Downsample<B>,
rt3: ResTransformer<B>, rt3: ResTransformer<B>,
rt4: ResTransformer<B>, rt4: ResTransformer<B>,
d2: Downsample<B>, d2: Downsample<B>,
rt5: ResTransformer<B>, rt5: ResTransformer<B>,
rt6: ResTransformer<B>, rt6: ResTransformer<B>,
d3: Downsample<B>, d3: Downsample<B>,
r1: ResBlock<B>, r1: ResBlock<B>,
r2: ResBlock<B>, r2: ResBlock<B>,
} }
impl<B: Backend> UNetInputBlocks<B> { impl<B: Backend> UNetInputBlocks<B> {
fn as_array(&self) -> [&dyn UNetBlock<B>; 12] { fn as_array(&self) -> [&dyn UNetBlock<B>; 12] {
[ [
&self.conv, &self.conv, &self.rt1, &self.rt2, &self.d1, &self.rt3, &self.rt4, &self.d2, &self.rt5,
&self.rt1, &self.rt6, &self.d3, &self.r1, &self.r2,
&self.rt2,
&self.d1,
&self.rt3,
&self.rt4,
&self.d2,
&self.rt5,
&self.rt6,
&self.d3,
&self.r1,
&self.r2,
] ]
} }
} }
@@ -177,67 +174,57 @@ pub struct UNetOutputBlocks<B: Backend> {
ru: ResUpSample<B>, ru: ResUpSample<B>,
rt1: ResTransformer<B>, rt1: ResTransformer<B>,
rt2: ResTransformer<B>, rt2: ResTransformer<B>,
rtu1: ResTransformerUpsample<B>, rtu1: ResTransformerUpsample<B>,
rt3: ResTransformer<B>, rt3: ResTransformer<B>,
rt4: ResTransformer<B>, rt4: ResTransformer<B>,
rtu2: ResTransformerUpsample<B>, rtu2: ResTransformerUpsample<B>,
rt5: ResTransformer<B>, rt5: ResTransformer<B>,
rt6: ResTransformer<B>, rt6: ResTransformer<B>,
rt7: ResTransformer<B>, rt7: ResTransformer<B>,
} }
impl<B: Backend> UNetOutputBlocks<B> { impl<B: Backend> UNetOutputBlocks<B> {
fn as_array(&self) -> [&dyn UNetBlock<B>; 12] { fn as_array(&self) -> [&dyn UNetBlock<B>; 12] {
[ [
&self.r1, &self.r1, &self.r2, &self.ru, &self.rt1, &self.rt2, &self.rtu1, &self.rt3, &self.rt4,
&self.r2, &self.rtu2, &self.rt5, &self.rt6, &self.rt7,
&self.ru,
&self.rt1,
&self.rt2,
&self.rtu1,
&self.rt3,
&self.rt4,
&self.rtu2,
&self.rt5,
&self.rt6,
&self.rt7,
] ]
} }
} }
trait UNetBlock<B: Backend> { trait UNetBlock<B: Backend> {
fn forward(&self, x: Tensor<B, 4>, emb: Tensor<B, 2>, context: Tensor<B, 3>) -> Tensor<B, 4>; fn forward(&self, x: Tensor<B, 4>, emb: Tensor<B, 2>, context: Tensor<B, 3>) -> Tensor<B, 4>;
} }
#[derive(Config)] #[derive(Config)]
pub struct ResTransformerConfig { pub struct ResTransformerConfig {
n_channels_in: usize, n_channels_in: usize,
n_channels_embed: usize, n_channels_embed: usize,
n_channels_out: usize, n_channels_out: usize,
n_context_state: usize, n_context_state: usize,
n_head: usize, n_head: usize,
} }
impl ResTransformerConfig { impl ResTransformerConfig {
fn init<B: Backend>(&self) -> ResTransformer<B> { fn init<B: Backend>(&self) -> ResTransformer<B> {
let res = ResBlockConfig::new(self.n_channels_in, self.n_channels_embed, self.n_channels_out).init(); let res = ResBlockConfig::new(
let transformer = SpatialTransformerConfig::new(self.n_channels_out, self.n_context_state, self.n_head).init(); self.n_channels_in,
self.n_channels_embed,
self.n_channels_out,
)
.init();
let transformer =
SpatialTransformerConfig::new(self.n_channels_out, self.n_context_state, self.n_head)
.init();
ResTransformer { ResTransformer { res, transformer }
res,
transformer,
}
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct ResTransformer<B: Backend> { pub struct ResTransformer<B: Backend> {
res: ResBlock<B>, res: ResBlock<B>,
transformer: SpatialTransformer<B>, transformer: SpatialTransformer<B>,
} }
impl<B: Backend> UNetBlock<B> for ResTransformer<B> { impl<B: Backend> UNetBlock<B> for ResTransformer<B> {
@@ -250,27 +237,29 @@ impl<B: Backend> UNetBlock<B> for ResTransformer<B> {
#[derive(Config)] #[derive(Config)]
pub struct ResUpSampleConfig { pub struct ResUpSampleConfig {
n_channels_in: usize, n_channels_in: usize,
n_channels_embed: usize, n_channels_embed: usize,
n_channels_out: usize, n_channels_out: usize,
} }
impl ResUpSampleConfig { impl ResUpSampleConfig {
fn init<B: Backend>(&self) -> ResUpSample<B> { fn init<B: Backend>(&self) -> ResUpSample<B> {
let res = ResBlockConfig::new(self.n_channels_in, self.n_channels_embed, self.n_channels_out).init(); let res = ResBlockConfig::new(
self.n_channels_in,
self.n_channels_embed,
self.n_channels_out,
)
.init();
let upsample = UpsampleConfig::new(self.n_channels_out).init(); let upsample = UpsampleConfig::new(self.n_channels_out).init();
ResUpSample { ResUpSample { res, upsample }
res,
upsample,
}
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct ResUpSample<B: Backend> { pub struct ResUpSample<B: Backend> {
res: ResBlock<B>, res: ResBlock<B>,
upsample: Upsample<B>, upsample: Upsample<B>,
} }
impl<B: Backend> UNetBlock<B> for ResUpSample<B> { impl<B: Backend> UNetBlock<B> for ResUpSample<B> {
@@ -283,32 +272,39 @@ impl<B: Backend> UNetBlock<B> for ResUpSample<B> {
#[derive(Config)] #[derive(Config)]
pub struct ResTransformerUpsampleConfig { pub struct ResTransformerUpsampleConfig {
n_channels_in: usize, n_channels_in: usize,
n_channels_embed: usize, n_channels_embed: usize,
n_channels_out: usize, n_channels_out: usize,
n_context_state: usize, n_context_state: usize,
n_head: usize, n_head: usize,
} }
impl ResTransformerUpsampleConfig { impl ResTransformerUpsampleConfig {
fn init<B: Backend>(&self) -> ResTransformerUpsample<B> { fn init<B: Backend>(&self) -> ResTransformerUpsample<B> {
let res = ResBlockConfig::new(self.n_channels_in, self.n_channels_embed, self.n_channels_out).init(); let res = ResBlockConfig::new(
let transformer = SpatialTransformerConfig::new(self.n_channels_out, self.n_context_state, self.n_head).init(); self.n_channels_in,
self.n_channels_embed,
self.n_channels_out,
)
.init();
let transformer =
SpatialTransformerConfig::new(self.n_channels_out, self.n_context_state, self.n_head)
.init();
let upsample = UpsampleConfig::new(self.n_channels_out).init(); let upsample = UpsampleConfig::new(self.n_channels_out).init();
ResTransformerUpsample { ResTransformerUpsample {
res, res,
transformer, transformer,
upsample, upsample,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct ResTransformerUpsample<B: Backend> { pub struct ResTransformerUpsample<B: Backend> {
res: ResBlock<B>, res: ResBlock<B>,
transformer: SpatialTransformer<B>, transformer: SpatialTransformer<B>,
upsample: Upsample<B>, upsample: Upsample<B>,
} }
impl<B: Backend> UNetBlock<B> for ResTransformerUpsample<B> { impl<B: Backend> UNetBlock<B> for ResTransformerUpsample<B> {
@@ -322,32 +318,44 @@ impl<B: Backend> UNetBlock<B> for ResTransformerUpsample<B> {
#[derive(Config)] #[derive(Config)]
pub struct ResTransformerResConfig { pub struct ResTransformerResConfig {
n_channels_in: usize, n_channels_in: usize,
n_channels_embed: usize, n_channels_embed: usize,
n_channels_out: usize, n_channels_out: usize,
n_context_state: usize, n_context_state: usize,
n_head: usize, n_head: usize,
} }
impl ResTransformerResConfig { impl ResTransformerResConfig {
fn init<B: Backend>(&self) -> ResTransformerRes<B> { fn init<B: Backend>(&self) -> ResTransformerRes<B> {
let res1 = ResBlockConfig::new(self.n_channels_in, self.n_channels_embed, self.n_channels_out).init(); let res1 = ResBlockConfig::new(
let transformer = SpatialTransformerConfig::new(self.n_channels_out, self.n_context_state, self.n_head).init(); self.n_channels_in,
let res2 = ResBlockConfig::new(self.n_channels_in, self.n_channels_embed, self.n_channels_out).init(); self.n_channels_embed,
self.n_channels_out,
)
.init();
let transformer =
SpatialTransformerConfig::new(self.n_channels_out, self.n_context_state, self.n_head)
.init();
let res2 = ResBlockConfig::new(
self.n_channels_in,
self.n_channels_embed,
self.n_channels_out,
)
.init();
ResTransformerRes { ResTransformerRes {
res1, res1,
transformer, transformer,
res2, res2,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct ResTransformerRes<B: Backend> { pub struct ResTransformerRes<B: Backend> {
res1: ResBlock<B>, res1: ResBlock<B>,
transformer: SpatialTransformer<B>, transformer: SpatialTransformer<B>,
res2: ResBlock<B>, res2: ResBlock<B>,
} }
impl<B: Backend> UNetBlock<B> for ResTransformerRes<B> { impl<B: Backend> UNetBlock<B> for ResTransformerRes<B> {
@@ -359,11 +367,9 @@ impl<B: Backend> UNetBlock<B> for ResTransformerRes<B> {
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct UpsampleConfig { pub struct UpsampleConfig {
n_channels: usize, n_channels: usize,
} }
impl UpsampleConfig { impl UpsampleConfig {
@@ -372,25 +378,23 @@ impl UpsampleConfig {
.with_padding(PaddingConfig2d::Explicit(1, 1)) .with_padding(PaddingConfig2d::Explicit(1, 1))
.init(); .init();
Upsample { Upsample { conv }
conv,
}
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct Upsample<B: Backend> { pub struct Upsample<B: Backend> {
conv: Conv2d<B>, conv: Conv2d<B>,
} }
impl<B: Backend> Upsample<B> { impl<B: Backend> Upsample<B> {
fn forward(&self, x: Tensor<B, 4>) -> Tensor<B, 4> { fn forward(&self, x: Tensor<B, 4>) -> Tensor<B, 4> {
let [n_batch, n_channel, height, width] = x.dims(); let [n_batch, n_channel, height, width] = x.dims();
let x = x let x = x
.reshape([n_batch, n_channel, height, 1, width, 1]) .reshape([n_batch, n_channel, height, 1, width, 1])
.repeat(3, 2) .repeat(3, 2)
.repeat(5, 2) .repeat(5, 2)
.reshape([n_batch, n_channel, 2 * height, 2 * width]); .reshape([n_batch, n_channel, 2 * height, 2 * width]);
self.conv.forward(x) self.conv.forward(x)
} }
} }
@@ -403,7 +407,7 @@ impl<B: Backend> UNetBlock<B> for Upsample<B> {
#[derive(Config)] #[derive(Config)]
pub struct DownsampleConfig { pub struct DownsampleConfig {
n_channels: usize, n_channels: usize,
} }
impl DownsampleConfig { impl DownsampleConfig {
@@ -423,38 +427,36 @@ impl<B: Backend> UNetBlock<B> for Conv2d<B> {
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct SpatialTransformerConfig { pub struct SpatialTransformerConfig {
n_channels: usize, n_channels: usize,
n_context_state: usize, n_context_state: usize,
n_head: usize, n_head: usize,
} }
impl SpatialTransformerConfig { impl SpatialTransformerConfig {
fn init<B: Backend>(&self) -> SpatialTransformer<B> { fn init<B: Backend>(&self) -> SpatialTransformer<B> {
let norm = GroupNormConfig::new(32, self.n_channels).init(); let norm = GroupNormConfig::new(32, self.n_channels).init();
let proj_in = Conv2dConfig::new([self.n_channels, self.n_channels], [1, 1]).init(); let proj_in = Conv2dConfig::new([self.n_channels, self.n_channels], [1, 1]).init();
let transformer = TransformerBlockConfig::new(self.n_channels, self.n_context_state, self.n_head).init(); let transformer =
TransformerBlockConfig::new(self.n_channels, self.n_context_state, self.n_head).init();
let proj_out = Conv2dConfig::new([self.n_channels, self.n_channels], [1, 1]).init(); let proj_out = Conv2dConfig::new([self.n_channels, self.n_channels], [1, 1]).init();
SpatialTransformer { SpatialTransformer {
norm, norm,
proj_in, proj_in,
transformer, transformer,
proj_out, proj_out,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct SpatialTransformer<B: Backend> { pub struct SpatialTransformer<B: Backend> {
norm: GroupNorm<B>, norm: GroupNorm<B>,
proj_in: Conv2d<B>, proj_in: Conv2d<B>,
transformer: TransformerBlock<B>, transformer: TransformerBlock<B>,
proj_out: Conv2d<B>, proj_out: Conv2d<B>,
} }
impl<B: Backend> SpatialTransformer<B> { impl<B: Backend> SpatialTransformer<B> {
@@ -465,9 +467,13 @@ impl<B: Backend> SpatialTransformer<B> {
let x = self.norm.forward(x); let x = self.norm.forward(x);
let x = self.proj_in.forward(x); let x = self.proj_in.forward(x);
let x = x.reshape([n_batch, n_channel, height * width]).swap_dims(1, 2); let x = x
.reshape([n_batch, n_channel, height * width])
.swap_dims(1, 2);
let x = self.transformer.forward(x, context) let x = self
.transformer
.forward(x, context)
.swap_dims(1, 2) .swap_dims(1, 2)
.reshape([n_batch, n_channel, height, width]); .reshape([n_batch, n_channel, height, width]);
@@ -475,18 +481,11 @@ impl<B: Backend> SpatialTransformer<B> {
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct TransformerBlockConfig { pub struct TransformerBlockConfig {
n_state: usize, n_state: usize,
n_context_state: usize, n_context_state: usize,
n_head: usize, n_head: usize,
} }
impl TransformerBlockConfig { impl TransformerBlockConfig {
@@ -494,44 +493,44 @@ impl TransformerBlockConfig {
let norm1 = nn::LayerNormConfig::new(self.n_state).init(); let norm1 = nn::LayerNormConfig::new(self.n_state).init();
let attn1 = MultiHeadAttentionConfig::new(self.n_state, self.n_state, self.n_head).init(); let attn1 = MultiHeadAttentionConfig::new(self.n_state, self.n_state, self.n_head).init();
let norm2 = nn::LayerNormConfig::new(self.n_state).init(); let norm2 = nn::LayerNormConfig::new(self.n_state).init();
let attn2 = MultiHeadAttentionConfig::new(self.n_state, self.n_context_state, self.n_head).init(); let attn2 =
MultiHeadAttentionConfig::new(self.n_state, self.n_context_state, self.n_head).init();
let norm3 = nn::LayerNormConfig::new(self.n_state).init(); let norm3 = nn::LayerNormConfig::new(self.n_state).init();
let mlp = MLPConfig::new(self.n_state, 4).init(); let mlp = MLPConfig::new(self.n_state, 4).init();
TransformerBlock { TransformerBlock {
norm1, norm1,
attn1, attn1,
norm2, norm2,
attn2, attn2,
norm3, norm3,
mlp, mlp,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct TransformerBlock<B: Backend> { pub struct TransformerBlock<B: Backend> {
norm1: nn::LayerNorm<B>, norm1: nn::LayerNorm<B>,
attn1: MultiHeadAttention<B>, attn1: MultiHeadAttention<B>,
norm2: nn::LayerNorm<B>, norm2: nn::LayerNorm<B>,
attn2: MultiHeadAttention<B>, attn2: MultiHeadAttention<B>,
norm3: nn::LayerNorm<B>, norm3: nn::LayerNorm<B>,
mlp: MLP<B>, mlp: MLP<B>,
} }
impl<B: Backend> TransformerBlock<B> { impl<B: Backend> TransformerBlock<B> {
fn forward(&self, x: Tensor<B, 3>, context: Tensor<B, 3>) -> Tensor<B, 3> { fn forward(&self, x: Tensor<B, 3>, context: Tensor<B, 3>) -> Tensor<B, 3> {
let x = x.clone() + self.attn1.forward( self.norm1.forward(x), None); let x = x.clone() + self.attn1.forward(self.norm1.forward(x), None);
let x = x.clone() + self.attn2.forward( self.norm2.forward(x), Some(context)); let x = x.clone() + self.attn2.forward(self.norm2.forward(x), Some(context));
x.clone() + self.mlp.forward( self.norm3.forward(x) ) x.clone() + self.mlp.forward(self.norm3.forward(x))
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct MLPConfig { pub struct MLPConfig {
n_state: usize, n_state: usize,
mult: usize, mult: usize,
} }
impl MLPConfig { impl MLPConfig {
@@ -540,30 +539,26 @@ impl MLPConfig {
let geglu = GEGLUConfig::new(self.n_state, n_state_hidden).init(); let geglu = GEGLUConfig::new(self.n_state, n_state_hidden).init();
let lin = nn::LinearConfig::new(n_state_hidden, self.n_state).init(); let lin = nn::LinearConfig::new(n_state_hidden, self.n_state).init();
MLP { MLP { geglu, lin }
geglu,
lin,
}
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct MLP<B: Backend> { pub struct MLP<B: Backend> {
geglu: GEGLU<B>, geglu: GEGLU<B>,
lin: nn::Linear<B>, lin: nn::Linear<B>,
} }
impl<B: Backend> MLP<B> { impl<B: Backend> MLP<B> {
pub fn forward(&self, x: Tensor<B, 3>) -> Tensor<B, 3> { pub fn forward(&self, x: Tensor<B, 3>) -> Tensor<B, 3> {
self.lin.forward( self.geglu.forward(x) ) self.lin.forward(self.geglu.forward(x))
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct GEGLUConfig { pub struct GEGLUConfig {
n_state_in: usize, n_state_in: usize,
n_state_out: usize, n_state_out: usize,
} }
impl GEGLUConfig { impl GEGLUConfig {
@@ -571,17 +566,14 @@ impl GEGLUConfig {
let proj = nn::LinearConfig::new(self.n_state_in, 2 * self.n_state_out).init(); let proj = nn::LinearConfig::new(self.n_state_in, 2 * self.n_state_out).init();
let gelu = GELU::new(); let gelu = GELU::new();
GEGLU { GEGLU { proj, gelu }
proj,
gelu,
}
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct GEGLU<B: Backend> { pub struct GEGLU<B: Backend> {
proj: nn::Linear<B>, proj: nn::Linear<B>,
gelu: GELU, gelu: GELU,
} }
impl<B: Backend> GEGLU<B> { impl<B: Backend> GEGLU<B> {
@@ -591,51 +583,60 @@ impl<B: Backend> GEGLU<B> {
let n_state_out = n_state / 2; let n_state_out = n_state / 2;
let x = projected.clone().slice([0..n_batch, 0..n_ctx, 0..n_state_out]); let x = projected
.clone()
.slice([0..n_batch, 0..n_ctx, 0..n_state_out]);
let gate = projected.slice([0..n_batch, 0..n_ctx, n_state_out..n_state]); let gate = projected.slice([0..n_batch, 0..n_ctx, n_state_out..n_state]);
x * self.gelu.forward(gate) x * self.gelu.forward(gate)
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct MultiHeadAttentionConfig { pub struct MultiHeadAttentionConfig {
n_state: usize, n_state: usize,
n_context_state: usize, n_context_state: usize,
n_head: usize, n_head: usize,
} }
impl MultiHeadAttentionConfig { impl MultiHeadAttentionConfig {
fn init<B: Backend>(&self) -> MultiHeadAttention<B> { fn init<B: Backend>(&self) -> MultiHeadAttention<B> {
assert!(self.n_state % self.n_head == 0, "State size {} must be a multiple of head size {}", self.n_state, self.n_head); assert!(
self.n_state % self.n_head == 0,
"State size {} must be a multiple of head size {}",
self.n_state,
self.n_head
);
let n_head = self.n_head; let n_head = self.n_head;
let query = nn::LinearConfig::new(self.n_state, self.n_state).with_bias(false).init(); let query = nn::LinearConfig::new(self.n_state, self.n_state)
let key = nn::LinearConfig::new(self.n_context_state, self.n_state).with_bias(false).init(); .with_bias(false)
let value = nn::LinearConfig::new(self.n_context_state, self.n_state).with_bias(false).init(); .init();
let key = nn::LinearConfig::new(self.n_context_state, self.n_state)
.with_bias(false)
.init();
let value = nn::LinearConfig::new(self.n_context_state, self.n_state)
.with_bias(false)
.init();
let out = nn::LinearConfig::new(self.n_state, self.n_state).init(); let out = nn::LinearConfig::new(self.n_state, self.n_state).init();
MultiHeadAttention { MultiHeadAttention {
n_head, n_head,
query, query,
key, key,
value, value,
out out,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct MultiHeadAttention<B: Backend> { pub struct MultiHeadAttention<B: Backend> {
n_head: usize, n_head: usize,
query: nn::Linear<B>, query: nn::Linear<B>,
key: nn::Linear<B>, key: nn::Linear<B>,
value: nn::Linear<B>, value: nn::Linear<B>,
out: nn::Linear<B>, out: nn::Linear<B>,
} }
impl<B: Backend> MultiHeadAttention<B> { impl<B: Backend> MultiHeadAttention<B> {
@@ -652,74 +653,61 @@ impl<B: Backend> MultiHeadAttention<B> {
} }
} }
#[derive(Config)] #[derive(Config)]
pub struct ResBlockConfig { pub struct ResBlockConfig {
n_channels_in: usize, n_channels_in: usize,
n_channels_embed: usize, n_channels_embed: usize,
n_channels_out: usize, n_channels_out: usize,
} }
impl ResBlockConfig { impl ResBlockConfig {
fn init<B: Backend>(&self) -> ResBlock<B> { fn init<B: Backend>(&self) -> ResBlock<B> {
let norm_in = GroupNormConfig::new(32, self.n_channels_in).init(); let norm_in = GroupNormConfig::new(32, self.n_channels_in).init();
let silu_in = SILU::new(); let silu_in = SILU::new();
let conv_in = Conv2dConfig::new([self.n_channels_in, self.n_channels_out], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init(); let conv_in = Conv2dConfig::new([self.n_channels_in, self.n_channels_out], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init();
let silu_embed = SILU::new(); let silu_embed = SILU::new();
let lin_embed = nn::LinearConfig::new(self.n_channels_embed, self.n_channels_out).init(); let lin_embed = nn::LinearConfig::new(self.n_channels_embed, self.n_channels_out).init();
let norm_out = GroupNormConfig::new(32, self.n_channels_out).init(); let norm_out = GroupNormConfig::new(32, self.n_channels_out).init();
let silu_out = SILU::new(); let silu_out = SILU::new();
let conv_out = Conv2dConfig::new([self.n_channels_out, self.n_channels_out], [3, 3]).with_padding(PaddingConfig2d::Explicit(1, 1)).init(); let conv_out = Conv2dConfig::new([self.n_channels_out, self.n_channels_out], [3, 3])
.with_padding(PaddingConfig2d::Explicit(1, 1))
.init();
let skip_connection = if self.n_channels_in != self.n_channels_out { let skip_connection = if self.n_channels_in != self.n_channels_out {
Some( Conv2dConfig::new([self.n_channels_in, self.n_channels_out], [1, 1]).init() ) Some(Conv2dConfig::new([self.n_channels_in, self.n_channels_out], [1, 1]).init())
} else { } else {
None None
}; };
ResBlock { ResBlock {
norm_in, norm_in,
silu_in, silu_in,
conv_in, conv_in,
silu_embed, silu_embed,
lin_embed, lin_embed,
norm_out, norm_out,
silu_out, silu_out,
conv_out, conv_out,
skip_connection, skip_connection,
} }
} }
} }
#[derive(Module, Debug)] #[derive(Module, Debug)]
pub struct ResBlock<B: Backend> { pub struct ResBlock<B: Backend> {
norm_in: GroupNorm<B>, norm_in: GroupNorm<B>,
silu_in: SILU, silu_in: SILU,
conv_in: Conv2d<B>, conv_in: Conv2d<B>,
silu_embed: SILU, silu_embed: SILU,
lin_embed: nn::Linear<B>, lin_embed: nn::Linear<B>,
norm_out: GroupNorm<B>, norm_out: GroupNorm<B>,
silu_out: SILU, silu_out: SILU,
conv_out: Conv2d<B>, conv_out: Conv2d<B>,
skip_connection: Option<Conv2d<B>>, skip_connection: Option<Conv2d<B>>,
} }
impl<B: Backend> ResBlock<B> { impl<B: Backend> ResBlock<B> {
@@ -730,7 +718,7 @@ impl<B: Backend> ResBlock<B> {
let embed_out = self.silu_embed.forward(embed); let embed_out = self.silu_embed.forward(embed);
let embed_out = self.lin_embed.forward(embed_out); let embed_out = self.lin_embed.forward(embed_out);
let [n_batch_embed, n_state_embed] = embed_out.dims(); let [n_batch_embed, n_state_embed] = embed_out.dims();
let h = h + embed_out.reshape([n_batch_embed, n_state_embed, 1, 1]); let h = h + embed_out.reshape([n_batch_embed, n_state_embed, 1, 1]);
@@ -751,5 +739,3 @@ impl<B: Backend> UNetBlock<B> for ResBlock<B> {
self.forward(x, emb) self.forward(x, emb)
} }
} }

View File

@@ -1,13 +1,14 @@
use std::collections::HashMap;
use regex::Regex; use regex::Regex;
use std::collections::HashMap;
use std::fs::File; use std::fs::File;
use std::io::{self, BufRead}; use std::io::{self, BufRead};
fn bytes_to_unicode() -> Vec<(u8, char)> { fn bytes_to_unicode() -> Vec<(u8, char)> {
let mut bs: Vec<u8> = ('!' as u8 ..= '~' as u8).into_iter() let mut bs: Vec<u8> = ('!' as u8..='~' as u8)
.chain( ('¡' as u8..='¬' as u8).into_iter() ) .into_iter()
.chain( ('®' as u8..='ÿ' as u8).into_iter() ) .chain(('¡' as u8..='¬' as u8).into_iter())
.chain(('®' as u8..='ÿ' as u8).into_iter())
.collect(); .collect();
let mut cs: Vec<_> = bs.iter().cloned().map(char::from).collect(); let mut cs: Vec<_> = bs.iter().cloned().map(char::from).collect();
@@ -16,25 +17,21 @@ fn bytes_to_unicode() -> Vec<(u8, char)> {
for b in 0u8..=255u8 { for b in 0u8..=255u8 {
if !bs.contains(&b) { if !bs.contains(&b) {
bs.push(b); bs.push(b);
cs.push( char::from_u32(256 + n).unwrap() ); cs.push(char::from_u32(256 + n).unwrap());
n += 1; n += 1;
} }
} }
bs.into_iter() bs.into_iter()
.zip( .zip(cs.into_iter().map(|c| c.into()))
cs.into_iter() .collect()
.map(|c| c.into())
).collect()
} }
/// Returns every adjacent pair of symbols in `word`, left to right.
///
/// A word with fewer than two symbols yields an empty vector.
fn get_pairs(word: &[String]) -> Vec<(String, String)> {
    word.windows(2)
        .map(|pair| (pair[0].clone(), pair[1].clone()))
        .collect()
}
fn whitespace_clean(text: &str) -> String { fn whitespace_clean(text: &str) -> String {
@@ -44,24 +41,27 @@ fn whitespace_clean(text: &str) -> String {
/// Reads whitespace-separated merge pairs from the file at `path`, one pair per line.
///
/// Only the first two fields of a line are kept; lines with fewer than two fields are
/// skipped.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be opened or a line cannot be read.
fn load_merges(path: &str) -> io::Result<Vec<(String, String)>> {
    let reader = io::BufReader::new(File::open(path)?);

    reader
        .lines()
        .filter_map(|line| match line {
            Ok(text) => {
                let mut fields = text.split_whitespace();
                match (fields.next(), fields.next()) {
                    (Some(left), Some(right)) => Some(Ok((left.to_owned(), right.to_owned()))),
                    _ => None,
                }
            }
            // Surface the read error; collecting into `Result` stops at the first one.
            Err(err) => Some(Err(err)),
        })
        .collect()
}
fn construct_vocab(chars: impl Iterator<Item=char> + Clone, merges: &[(String, String)]) -> Vec<String> { fn construct_vocab(
chars: impl Iterator<Item = char> + Clone,
merges: &[(String, String)],
) -> Vec<String> {
let iter = chars.map(String::from); let iter = chars.map(String::from);
let mut vocab: Vec<_> = iter.clone().chain( iter.map(|c| c + "</w>") ).collect(); let mut vocab: Vec<_> = iter.clone().chain(iter.map(|c| c + "</w>")).collect();
for merge in merges { for merge in merges {
vocab.push(format!("{}{}", merge.0, merge.1)); vocab.push(format!("{}{}", merge.0, merge.1));
@@ -79,7 +79,7 @@ pub struct SimpleTokenizer {
decoder: HashMap<u32, String>, decoder: HashMap<u32, String>,
bpe_ranks: HashMap<(String, String), u32>, bpe_ranks: HashMap<(String, String), u32>,
cache: HashMap<String, String>, cache: HashMap<String, String>,
pat: Regex, pat: Regex,
} }
impl SimpleTokenizer { impl SimpleTokenizer {
@@ -87,10 +87,10 @@ impl SimpleTokenizer {
let byte_unicode_values = bytes_to_unicode(); let byte_unicode_values = bytes_to_unicode();
let byte_encoder: HashMap<_, _> = byte_unicode_values.iter().cloned().collect(); let byte_encoder: HashMap<_, _> = byte_unicode_values.iter().cloned().collect();
let byte_decoder = byte_encoder.iter().map(|(k,v)| (*v,*k)).collect(); let byte_decoder = byte_encoder.iter().map(|(k, v)| (*v, *k)).collect();
let merges = load_merges("bpe_simple_vocab_16e6.txt")?; let merges = load_merges("bpe_simple_vocab_16e6.txt")?;
let merges = merges[1..49152-256-2+1].to_vec(); let merges = merges[1..49152 - 256 - 2 + 1].to_vec();
let vocab = construct_vocab(byte_unicode_values.into_iter().map(|(_, u)| u), &merges[..]); let vocab = construct_vocab(byte_unicode_values.into_iter().map(|(_, u)| u), &merges[..]);
@@ -98,38 +98,39 @@ impl SimpleTokenizer {
let decoder: HashMap<u32, String> = encoder.iter().map(|(k, v)| (*v, k.clone())).collect(); let decoder: HashMap<u32, String> = encoder.iter().map(|(k, v)| (*v, k.clone())).collect();
let bpe_ranks = merges.iter().cloned().zip((0..).into_iter()).collect(); let bpe_ranks = merges.iter().cloned().zip((0..).into_iter()).collect();
let cache = HashMap::from([ let cache = HashMap::from([
("<|startoftext|>".to_string(), "<|startoftext|>".to_string()), ("<|startoftext|>".to_string(), "<|startoftext|>".to_string()),
("<|endoftext|>".to_string(), "<|endoftext|>".to_string()), ("<|endoftext|>".to_string(), "<|endoftext|>".to_string()),
]); ]);
let pat = Regex::new(r"(?i)<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|\p{L}+|\p{N}|[^\s\p{L}\p{N}]+").unwrap(); let pat = Regex::new(r"(?i)<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|\p{L}+|\p{N}|[^\s\p{L}\p{N}]+").unwrap();
Ok( SimpleTokenizer { Ok(SimpleTokenizer {
byte_encoder: byte_encoder, byte_encoder: byte_encoder,
byte_decoder: byte_decoder, byte_decoder: byte_decoder,
encoder: encoder, encoder: encoder,
decoder: decoder, decoder: decoder,
bpe_ranks: bpe_ranks, bpe_ranks: bpe_ranks,
cache: cache, cache: cache,
pat: pat, pat: pat,
} ) })
} }
pub fn bpe(&self, token: &str) -> String { pub fn bpe(&self, token: &str) -> String {
if let Some(word) = self.cache.get(token) { if let Some(word) = self.cache.get(token) {
return word.clone(); return word.clone();
} }
let mut word: Vec<String> = token.chars().map(|c| c.to_string()).collect(); let mut word: Vec<String> = token.chars().map(|c| c.to_string()).collect();
word.last_mut().map(|w| *w += "</w>"); word.last_mut().map(|w| *w += "</w>");
let mut pairs = get_pairs(&word); let mut pairs = get_pairs(&word);
if pairs.is_empty() { if pairs.is_empty() {
return format!("{}{}", token, "</w>"); return format!("{}{}", token, "</w>");
} }
loop { loop {
let bigram = pairs.iter() let bigram = pairs
.iter()
.filter(|pair| self.bpe_ranks.contains_key(pair)) .filter(|pair| self.bpe_ranks.contains_key(pair))
.min_by_key(|&pair| self.bpe_ranks[pair]); .min_by_key(|&pair| self.bpe_ranks[pair]);
@@ -141,14 +142,14 @@ impl SimpleTokenizer {
let mut new_word = Vec::new(); let mut new_word = Vec::new();
let mut i = 0; let mut i = 0;
while i < word.len() { while i < word.len() {
if let Some( (j, _) ) = word.iter().enumerate().skip(i).find(|(_, w)| w == &first) { if let Some((j, _)) = word.iter().enumerate().skip(i).find(|(_, w)| w == &first) {
new_word.extend(word[i..j].iter().cloned()); new_word.extend(word[i..j].iter().cloned());
i = j; i = j;
} else { } else {
new_word.extend(word[i..].iter().cloned()); new_word.extend(word[i..].iter().cloned());
break; break;
} }
if &word[i] == first && i < word.len() - 1 && &word[i + 1] == second { if &word[i] == first && i < word.len() - 1 && &word[i + 1] == second {
new_word.push(format!("{}{}", first, second)); new_word.push(format!("{}{}", first, second));
i += 2; i += 2;
@@ -157,7 +158,7 @@ impl SimpleTokenizer {
i += 1; i += 1;
} }
} }
word = new_word; word = new_word;
if word.len() == 1 { if word.len() == 1 {
break; break;
@@ -170,7 +171,7 @@ impl SimpleTokenizer {
//self.cache.insert(token.into(), word); //self.cache.insert(token.into(), word);
return word; return word;
} }
pub fn encode(&self, text: &str) -> Vec<u32> { pub fn encode(&self, text: &str) -> Vec<u32> {
let cleaned_text = whitespace_clean(text.trim()).to_lowercase(); let cleaned_text = whitespace_clean(text.trim()).to_lowercase();
@@ -178,8 +179,16 @@ impl SimpleTokenizer {
for m in self.pat.find_iter(&cleaned_text) { for m in self.pat.find_iter(&cleaned_text) {
let token = m.as_str(); let token = m.as_str();
let token: String = token.as_bytes().into_iter().map(|b| self.byte_encoder[b]).collect(); let token: String = token
bpe_tokens.extend(self.bpe(&token).split(' ').map(|bpe_token| self.encoder[bpe_token])) .as_bytes()
.into_iter()
.map(|b| self.byte_encoder[b])
.collect();
bpe_tokens.extend(
self.bpe(&token)
.split(' ')
.map(|bpe_token| self.encoder[bpe_token]),
)
} }
return bpe_tokens; return bpe_tokens;
@@ -187,9 +196,7 @@ impl SimpleTokenizer {
pub fn decode(&self, tokens: &[u32]) -> String { pub fn decode(&self, tokens: &[u32]) -> String {
let text: String = tokens.iter().map(|t| self.decoder[t].as_str()).collect(); let text: String = tokens.iter().map(|t| self.decoder[t].as_str()).collect();
let decoded_bytes: Vec<u8> = text.chars() let decoded_bytes: Vec<u8> = text.chars().map(|c| self.byte_decoder[&c]).collect();
.map(|c| self.byte_decoder[&c])
.collect();
String::from_utf8_lossy(&decoded_bytes[..]).replace("</w>", " ") String::from_utf8_lossy(&decoded_bytes[..]).replace("</w>", " ")
} }
@@ -212,4 +219,4 @@ mod tests {
let decoded = tokenizer.decode(&encoded[..]); let decoded = tokenizer.decode(&encoded[..]);
assert_eq!(target_decode, decoded); assert_eq!(target_decode, decoded);
} }
} }