In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install transformers

In [None]:
!unzip aa.zip

In [None]:
!pip install wandb
!wandb login

In [1]:
import clip
import os
from torch import nn
import numpy as np
import torch
import torch.nn.functional as nnf
import sys
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm, trange
import PIL.Image
from IPython.display import Image 
import pandas as pd


device = 'cuda' if torch.cuda.is_available() else 'cpu'



In [2]:


class MLP(nn.Module):
    """Stack of ``nn.Linear`` layers with an activation between consecutive layers.

    ``sizes`` lists the layer widths, so ``len(sizes) - 1`` linear layers are
    created. ``act`` is instantiated after every linear layer except the last.
    """

    def __init__(self, sizes, bias=True, act=nn.Tanh):
        super().__init__()
        last_linear = len(sizes) - 2  # index of the final linear layer
        stack = []
        for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            stack.append(nn.Linear(n_in, n_out, bias=bias))
            if idx != last_linear:
                stack.append(act())
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        return self.model(x)

    


class ClipCaptionModel(nn.Module):
    """GPT-2 style language model conditioned on a projected CLIP embedding.

    The CLIP embedding (``prefix``) is mapped by ``clip_project`` to
    ``prefix_length`` vectors of the GPT embedding width; these are prepended
    to the token embeddings before the language model is called.
    """

    def __init__(self, prefix_length, prefix_size: int = 768):
        super(ClipCaptionModel, self).__init__()
        self.prefix_length = prefix_length

        self.gpt = GPT2LMHeadModel.from_pretrained('sberbank-ai/rugpt3large_based_on_gpt2')

        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
        projected_size = self.gpt_embedding_size * prefix_length

        if prefix_length > 10:  # not enough memory
            # A single linear layer instead of the two-layer MLP. It must map
            # prefix_size -> projected_size so that forward() can reshape the
            # result to (batch, prefix_length, gpt_embedding_size).
            self.clip_project = nn.Linear(prefix_size, projected_size)
        else:
            self.clip_project = MLP((prefix_size, projected_size // 2, projected_size))

    def get_dummy_token(self, batch_size, device):
        """Return a ``(batch_size, prefix_length)`` int64 tensor of zeros on ``device``."""
        return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)

    def forward(self, tokens, prefix, mask=None, labels=None):
        """Embed ``tokens``, prepend the projected ``prefix`` and call the GPT model.

        When ``labels`` is not None its value is replaced by ``tokens`` preceded
        by ``prefix_length`` zero ids (see ``get_dummy_token``). Returns whatever
        ``self.gpt`` returns.
        """
        embedding_text = self.gpt.transformer.wte(tokens)
        prefix_projections = self.clip_project(prefix).view(-1, self.prefix_length, self.gpt_embedding_size)
        embedding_cat = torch.cat((prefix_projections, embedding_text), dim=1)
        if labels is not None:
            dummy_token = self.get_dummy_token(tokens.shape[0], tokens.device)
            labels = torch.cat((dummy_token, tokens), dim=1)
        out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=mask)
        return out

    

class ClipCaptionPrefix(ClipCaptionModel):
    """Variant that hands out only the projection's parameters and keeps GPT in eval mode."""

    def parameters(self, recurse = True):
        # `recurse` is accepted for signature compatibility but not used:
        # only the CLIP projection's parameters are returned.
        projection = self.clip_project
        return projection.parameters()

    def train(self, mode = True):
        # Switch the whole module first, then force the language model back to eval.
        super().train(mode)
        self.gpt.eval()
        return self

In [3]:
# Training table; later cells read its `video_name`, `question` and `answer` columns.
df_train = pd.read_csv('ru_train_full.csv')

In [4]:
import os
# Collect (video path, "Q: ... A: ..." text) pairs for rows whose video file exists on disk.
data = []
for video_name, question, answer in zip(df_train.video_name, df_train.question, df_train.answer):
    name = f'videos/{video_name}.mp4'
    if not os.path.exists(name):
        continue
    data.append((name, f'Q: {question} A: {answer}'))

In [5]:
import io
import os
import PIL
import random
import numpy as np
import torch
import torchvision
import transformers
import more_itertools
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
from torch.utils.data import Dataset
from tqdm import tqdm
from dataclasses import dataclass, field
import torchvision.transforms as T
import torchvision.transforms.functional as TF
import cv2
import clip
from PIL import Image
import pickle
from tqdm.contrib import tzip
from tqdm.notebook import tqdm

def image_grid(imgs, rows, cols):
    """Paste ``imgs`` row by row into one ``rows`` x ``cols`` RGB sheet.

    The tile size is taken from the first image. Raises ``AssertionError``
    when ``len(imgs) != rows * cols``.
    """
    assert len(imgs) == rows*cols

    tile_w, tile_h = imgs[0].size
    sheet = Image.new('RGB', size=(cols*tile_w, rows*tile_h))
    for idx, tile in enumerate(imgs):
        row, col = divmod(idx, cols)
        sheet.paste(tile, box=(col*tile_w, row*tile_h))
    return sheet

def read_video(path, transform=None, frames_num=16, window=30, size=(64, 64)):
    """Sample up to ``frames_num`` frames from a video as PIL thumbnails.

    Frames are taken at indices ``1, 1 + step, 1 + 2 * step, ...`` where
    ``step = frame_count // frames_num``. When the video has fewer frames than
    ``frames_num`` the step is 0 and at most one frame is returned. Sampling
    also stops at the first target frame that cannot be decoded.

    Args:
        path: video file path handed to ``cv2.VideoCapture``.
        transform: unused; kept for signature compatibility.
        frames_num: maximum number of frames to return.
        window: unused; kept for signature compatibility.
        size: bounding box passed to ``Image.thumbnail`` for every frame.

    Returns:
        list of RGB PIL images (possibly empty).
    """
    if frames_num <= 0:
        return []

    frames = []
    cap = cv2.VideoCapture(path)
    try:
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        step = length // frames_num
        target = 1  # index of the next frame to keep
        for index in range(length):
            # Once enough frames are collected, or the target was missed, no
            # later frame can be selected -- stop decoding the rest of the file.
            if len(frames) >= frames_num or index > target:
                break
            ret, frame = cap.read()
            if ret and index == target:
                frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
                frame.thumbnail(size, Image.LANCZOS)
                frames.append(frame)
                target += step
    finally:
        # Release the capture even if decoding or conversion raises.
        cap.release()
    return frames

In [6]:
# Device that CLIP is loaded onto and that the extraction loop moves images to.
device = torch.device('cuda')
# CLIP checkpoint name passed to clip.load below.
clip_model_type = "ViT-L/14@336px"

# Pickle file the extraction cell writes embeddings and captions to.
out_path = f"Features_train_full_ru.pkl"
# Directory searched for `<video_name>.mp4` files.
video_path =  'videos'


# clip.load returns the model together with its image preprocessing callable.
clip_model, preprocess = clip.load(clip_model_type, device=device, jit=False)

# path_a = 'activitynet-qa/dataset/train_a.json'
# path_q = 'activitynet-qa/dataset/train_q.json'
# df_a = pd.read_json(path_a)
# df_q = pd.read_json(path_q)

In [7]:
# Move CLIP to `device`; the trailing `None` keeps the cell from echoing the model repr.
clip_model.to(device)
None

In [None]:
# Each video is summarised as a GRID_ROWS x GRID_COLS contact sheet of frames,
# encoded with CLIP, and stored next to its "Q: ... A: ..." caption.
GRID_ROWS, GRID_COLS = 3, 3
FRAMES_PER_VIDEO = GRID_ROWS * GRID_COLS

all_embeddings = []
all_captions = []

for video_name, question, answer in tzip(df_train.video_name, df_train.question, df_train.answer):
    name = f'{video_path}/{video_name}.mp4'
    text = f'Q: {question} A: {answer}'
    if not os.path.exists(name):
        continue

    video = read_video(path=name, frames_num=FRAMES_PER_VIDEO)
    # image_grid asserts that it receives exactly rows * cols images; skip
    # videos that yielded fewer frames instead of aborting the whole run.
    if len(video) != FRAMES_PER_VIDEO:
        continue

    image = image_grid(video, GRID_ROWS, GRID_COLS)
    image = preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        prefix = clip_model.encode_image(image).cpu()
    all_embeddings.append(prefix)
    all_captions.append(text)

if not all_embeddings:
    # torch.cat on an empty list would fail with a less helpful message.
    raise RuntimeError(f'No usable videos found under {video_path!r}; nothing to save')

with open(out_path, 'wb') as f:
    pickle.dump({"clip_embedding": torch.cat(all_embeddings, dim=0), "captions": all_captions}, f)

print('Done')
print("%0d embeddings saved " % len(all_embeddings))

  0%|          | 0/27570 [00:00<?, ?it/s]

In [None]:

import gc
import io


import random
import numpy as np

import torchvision
import transformers
import more_itertools
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
from torch.utils.data import Dataset
from tqdm import tqdm
from dataclasses import dataclass, field
import torchvision.transforms as T
import torchvision.transforms.functional as TF
import cv2
from PIL import Image
import pickle
import sys
from tqdm.contrib import tzip
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as nnf
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

import os
import pickle
import sys
import argparse
import json
from typing import Tuple, Optional, Union
from torch.cuda.amp import autocast

from transformers.optimization import Adafactor, AdafactorSchedule
import wandb
import torch

from torch.utils.checkpoint import checkpoint_sequential

class ClipCocoDataset(Dataset):
    """Dataset of (padded caption tokens, attention mask, CLIP prefix) triples.

    The pickle at ``data_path`` must hold a dict with ``"clip_embedding"``
    (indexable per item) and ``"captions"`` (list of strings); caption ``i``
    is paired with embedding ``i``.
    """

    def __init__(self, data_path: str,  prefix_length= 50, gpt2_type = "sberbank-ai/rugpt3large_based_on_gpt2",
                 normalize_prefix=False):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.prefix_length = prefix_length
        self.normalize_prefix = normalize_prefix
        # NOTE: pickle.load can execute arbitrary code -- only open files you produced yourself.
        with open(data_path, 'rb') as f:
            all_data = pickle.load(f)
        print("Data size is %0d" % len(all_data["clip_embedding"]))
        sys.stdout.flush()
        self.prefixes = all_data["clip_embedding"]
        self.captions = all_data["captions"]

        self.captions_tokens = [
            torch.tensor(self.tokenizer.encode(caption), dtype=torch.int64)
            for caption in tqdm(self.captions)
        ]
        # Caption i is paired with prefix i.
        self.caption2embedding = [self.prefixes[i] for i in range(len(self.captions_tokens))]
        self.max_seq_len = self._choose_max_seq_len(self.captions_tokens)

    @staticmethod
    def _choose_max_seq_len(token_lists):
        """Return ``min(mean + 10 * std, max)`` of the token counts.

        Falls back to the maximum for a single caption (std is undefined
        there) and to 0 for an empty collection.
        """
        if len(token_lists) == 0:
            return 0
        all_len = torch.tensor([len(t) for t in token_lists]).float()
        longest = int(all_len.max())
        if all_len.numel() < 2:
            return longest
        return min(int(all_len.mean() + all_len.std() * 10), longest)

    def pad_tokens(self, item: int):
        """Return caption ``item`` padded/truncated to ``max_seq_len`` and its mask.

        Padding positions hold token id 0. The float mask has ``prefix_length``
        leading ones followed by ones for real tokens and zeros for padding.
        The stored tokens are not modified, so repeated calls give the same
        mask (an in-place rewrite of the cached tensor would lose the padding
        information after the first access).
        """
        tokens = self.captions_tokens[item]
        valid = min(tokens.shape[0], self.max_seq_len)
        padded = torch.zeros(self.max_seq_len, dtype=torch.int64)
        padded[:valid] = tokens[:valid]
        mask = torch.zeros(self.prefix_length + self.max_seq_len)
        mask[:self.prefix_length + valid] = 1.0
        return padded, mask

    def __len__(self) -> int:
        return len(self.captions_tokens)

    def __getitem__(self, item):
        """Return ``(tokens, mask, prefix)``; the prefix is L2-normalised when requested."""
        tokens, mask = self.pad_tokens(item)
        prefix = self.prefixes[item]
        if self.normalize_prefix:
            prefix = prefix.float()
            prefix = prefix / prefix.norm(2, -1)
        return tokens, mask, prefix

    

In [11]:
# Build the dataset from the pickle written by the feature-extraction cell.
dataset = ClipCocoDataset('Features_train_full_ru.pkl', prefix_length=50, normalize_prefix=False)

Data size is 27570


100%|██████████| 27570/27570 [00:03<00:00, 7264.12it/s]


In [17]:
# SECURITY: this cell previously held what looks like a pasted W&B API key run
# as a shell command. Treat that key as leaked and revoke it in the W&B settings.
# wandb.login() takes the key from the WANDB_API_KEY environment variable or
# prompts for it, so the key never has to be stored in the notebook.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
Aborted!


In [None]:

# Start the W&B run that `train` logs the loss to via wandb.log.
wandb.init(project="clip_caption_video")



class MLP(nn.Module):
    """Stack of ``nn.Linear`` layers with an activation between consecutive layers.

    ``sizes`` lists the layer widths; ``act`` is instantiated after every
    linear layer except the last one.
    """

    def __init__(self, sizes: Tuple[int, ...], bias=True, act=nn.Tanh):
        super().__init__()
        last_linear = len(sizes) - 2  # index of the final linear layer
        stack = []
        for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            stack.append(nn.Linear(n_in, n_out, bias=bias))
            if idx != last_linear:
                stack.append(act())
        self.model = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the sequential stack."""
        return self.model(x)

    
def freeze(
    model,
    freeze_emb=False,
    freeze_ln=False,
    freeze_attn=True,
    freeze_ff=True,
    freeze_other=True,
):
    """Set ``requires_grad`` on every parameter according to its (lower-cased) name.

    The first matching rule wins, checked in this order:
    ``'ln'`` or ``'norm'`` -> ``freeze_ln``; ``'embeddings'`` -> ``freeze_emb``;
    ``'mlp'`` -> ``freeze_ff``; ``'attn'`` -> ``freeze_attn``; anything else ->
    ``freeze_other``. Matching is plain substring search, so a parameter whose
    name contains none of these substrings (e.g. ``...wte.weight``) is governed
    by ``freeze_other``. Returns the same ``model`` object.
    """
    rules = (
        (('ln', 'norm'), freeze_ln),
        (('embeddings',), freeze_emb),
        (('mlp',), freeze_ff),
        (('attn',), freeze_attn),
    )
    for name, p in model.named_parameters():
        lowered = name.lower()
        frozen = freeze_other
        for needles, flag in rules:
            if any(needle in lowered for needle in needles):
                frozen = flag
                break
        p.requires_grad = not frozen
    return model

class ClipCaptionModel(nn.Module):
    """GPT-2 style language model conditioned on a projected CLIP embedding.

    ``clip_project`` maps a CLIP embedding of width ``prefix_size`` to
    ``prefix_length`` vectors of the GPT embedding width, which are prepended
    to the token embeddings.
    """

    def __init__(self, backbone, prefix_length: int, prefix_size: int = 768):
        super().__init__()
        self.prefix_length = prefix_length
        self.gpt = GPT2LMHeadModel.from_pretrained(backbone)
        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
        flat_prefix_dim = self.gpt_embedding_size * prefix_length
        self.clip_project = MLP((prefix_size, flat_prefix_dim // 2, flat_prefix_dim))

    def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor:
        """Return a ``(batch_size, prefix_length)`` int64 tensor of zeros on ``device``."""
        return torch.zeros((batch_size, self.prefix_length), dtype=torch.int64, device=device)

    def forward(self, tokens: torch.Tensor, prefix: torch.Tensor, mask: Optional[torch.Tensor] = None,
                labels: Optional[torch.Tensor] = None):
        """Embed ``tokens``, prepend the projected ``prefix`` and call the GPT model.

        When ``labels`` is not None its value is replaced by ``tokens`` preceded
        by ``prefix_length`` zero ids. Returns whatever ``self.gpt`` returns.
        """
        text_embeds = self.gpt.transformer.wte(tokens)
        prefix_embeds = self.clip_project(prefix).view(-1, self.prefix_length, self.gpt_embedding_size)
        inputs = torch.cat((prefix_embeds, text_embeds), dim=1)
        if labels is not None:
            placeholder = self.get_dummy_token(tokens.shape[0], tokens.device)
            labels = torch.cat((placeholder, tokens), dim=1)
        return self.gpt(inputs_embeds=inputs, labels=labels, attention_mask=mask)

  


class ClipCaptionPrefix(ClipCaptionModel):
    """Variant that hands out only the projection's parameters and keeps GPT in eval mode."""

    def parameters(self, recurse: bool = True):
        # `recurse` is accepted for signature compatibility but not used:
        # only the CLIP projection's parameters are returned.
        projection = self.clip_project
        return projection.parameters()

    def train(self, mode: bool = True):
        # Switch the whole module first, then force the language model back to eval.
        super().train(mode)
        self.gpt.eval()
        return self




def train(dataset, model: ClipCaptionModel, args,
          warmup_steps: int = 5000, output_dir: str = ".", output_prefix: str = ""):
    """Train ``model`` on ``dataset`` and write checkpoints to ``output_dir``.

    Args:
        dataset: yields ``(tokens, mask, prefix)``; must expose ``prefix_length``.
        model: captioning model; called as ``model(tokens, prefix, mask)``.
        args: object with ``bs``, ``epochs``, ``lr`` and ``save_every``.
        warmup_steps: warm-up steps of the linear LR schedule.
        output_dir: directory for checkpoints (created if missing).
        output_prefix: file-name prefix for checkpoints and the progress bar.

    Returns:
        the trained model.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_size = args.bs
    epochs = args.epochs
    os.makedirs(output_dir, exist_ok=True)

    model = model.to(device)

    # NOTE(review): with freeze()'s defaults every parameter whose name contains
    # none of 'ln'/'norm'/'embeddings'/'mlp'/'attn' is frozen. That appears to
    # include the `clip_project` mapper -- confirm that this is intended.
    model = freeze(model)
    model.train()
    optimizer = AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.995))

    train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=epochs * len(train_dataloader)
    )

    for epoch in range(epochs):
        print(f">>> Training epoch {epoch}")
        sys.stdout.flush()
        progress = tqdm(total=len(train_dataloader), desc=output_prefix)
        for idx, (tokens, mask, prefix) in enumerate(train_dataloader):
            model.zero_grad()
            tokens, mask, prefix = tokens.to(device), mask.to(device), prefix.to(device, dtype=torch.float32)

            outputs = model(tokens, prefix, mask)
            # Logits at positions prefix_length-1 .. -2 predict the caption tokens.
            logits = outputs.logits[:, dataset.prefix_length - 1: -1]
            # Token id 0 is used for padding by the dataset and is ignored here.
            loss = nnf.cross_entropy(logits.reshape(-1, logits.shape[-1]), tokens.flatten(), ignore_index=0)

            loss.backward()
            # Exactly one optimizer/scheduler step per batch: the schedule above
            # is sized for epochs * len(train_dataloader) steps.
            optimizer.step()
            scheduler.step()

            loss_value = loss.item()
            progress.set_postfix({"loss": loss_value})
            wandb.log({"loss": loss_value})
            progress.update()

            del tokens
            del mask
            del prefix
            torch.clear_autocast_cache()
            if device.type == 'cuda':
                torch.cuda.empty_cache()

            if (idx + 1) % 7000 == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(output_dir, f"{output_prefix}_latest.pt"),
                )
        progress.close()
        if epoch % args.save_every == 0:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch:03d}.pt"),
            )
    return model



class Args():
    """Plain attribute bag holding the run's hyper-parameters and paths."""

    def __init__(self):
        defaults = (
            ('backbone', 'sberbank-ai/rugpt3small_based_on_gpt2'),
            ('data', 'Features_train_full_ru.pkl'),
            ('out_dir', 'checkpoints_larger'),
            ('prefix', 'prefix_1'),
            ('epochs', 10),
            ('save_every', 1),
            ('prefix_length', 50),
            ('bs', 20),
            ('only_prefix', False),
            ('lr', 5e-5),
        )
        for key, value in defaults:
            setattr(self, key, value)
        


def main():
    """Build the dataset and model from ``Args`` and run training."""
    args = Args()
    # Record the hyper-parameters on the active W&B run. Assigning to
    # `wandb.config` would only rebind the module attribute.
    wandb.config.update(
        {
            "learning_rate": args.lr,
            "epochs": args.epochs,
            "batch_size": args.bs,
        },
        allow_val_change=True,  # lets the cell be re-run with different values
    )

    prefix_length = args.prefix_length

    dataset = ClipCocoDataset(args.data, prefix_length)

    # NOTE(review): the backbone is hard-coded here and differs from
    # `args.backbone`; confirm which checkpoint is meant to be trained.
    # The dataset and the model must use the same prefix length, so both read
    # it from `args`.
    model = ClipCaptionModel(backbone='sberbank-ai/rugpt3large_based_on_gpt2', prefix_length=prefix_length)
    print("Train both prefix and GPT")
    sys.stdout.flush()
    train(dataset, model, args, output_dir=args.out_dir, output_prefix=args.prefix)

In [None]:
!rm -r checkpoints*

In [None]:
main()

In [None]:
!cp /content/checkpoints/prefix_1-007.pt /content/drive/MyDrive/4

In [None]:
!cp /content/drive/MyDrive/4/prefix_1-007.pt .

In [None]:
!pip install tg-logger

In [None]:
import clip

In [None]:
# Inference setup: CLIP encoder, tokenizer and the trained captioning model.
device = 'cuda'
clip_model, preprocess = clip.load("ViT-L/14@336px", device=device, jit=False)
tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3large_based_on_gpt2')
prefix_length= 50
model_path = 'prefix_1-007.pt'
# NOTE(review): the architecture is built from 'gpt2' while the tokenizer and the
# training cell name ruGPT3 checkpoints; load_state_dict below only succeeds if
# the checkpoint's tensor shapes match this backbone -- confirm which model was trained.
model = ClipCaptionModel(backbone = 'gpt2', prefix_length = 50)
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load(model_path, map_location='cpu')) 
model.to(device)
None

In [None]:
import io
import os
import PIL
import random
import numpy as np
import torch
import torchvision
import transformers
import more_itertools
import numpy as np
import matplotlib.pyplot as plt
#from tqdm import tqdm
import pandas as pd
from torch.utils.data import Dataset
#from tqdm import tqdm
from dataclasses import dataclass, field
import torchvision.transforms as T
import torchvision.transforms.functional as TF
import cv2
from PIL import Image
def image_grid(imgs, rows, cols):
    """Paste ``imgs`` row by row into one ``rows`` x ``cols`` RGB sheet.

    The tile size is taken from the first image. Raises ``AssertionError``
    when ``len(imgs) != rows * cols``.
    """
    assert len(imgs) == rows*cols

    tile_w, tile_h = imgs[0].size
    sheet = Image.new('RGB', size=(cols*tile_w, rows*tile_h))
    for idx, tile in enumerate(imgs):
        row, col = divmod(idx, cols)
        sheet.paste(tile, box=(col*tile_w, row*tile_h))
    return sheet
def read_video(path, transform=None, frames_num=9, window=30, size=(193, 193)):
    """Sample up to ``frames_num`` frames from a video as PIL thumbnails.

    Frames are taken at indices ``1, 1 + step, 1 + 2 * step, ...`` where
    ``step = frame_count // frames_num``. When the video has fewer frames than
    ``frames_num`` the step is 0 and at most one frame is returned. Sampling
    also stops at the first target frame that cannot be decoded.

    Args:
        path: video file path handed to ``cv2.VideoCapture``.
        transform: unused; kept for signature compatibility.
        frames_num: maximum number of frames to return.
        window: unused; kept for signature compatibility.
        size: bounding box passed to ``Image.thumbnail`` for every frame.

    Returns:
        list of RGB PIL images (possibly empty).
    """
    if frames_num <= 0:
        return []

    frames = []
    cap = cv2.VideoCapture(path)
    try:
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        step = length // frames_num
        target = 1  # index of the next frame to keep
        for index in range(length):
            # Once enough frames are collected, or the target was missed, no
            # later frame can be selected -- stop decoding the rest of the file.
            if len(frames) >= frames_num or index > target:
                break
            ret, frame = cap.read()
            if ret and index == target:
                frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
                frame.thumbnail(size, Image.LANCZOS)
                frames.append(frame)
                target += step
    finally:
        # Release the capture even if decoding or conversion raises.
        cap.release()
    return frames



def filter_ngrams(output_text):
    """Cut ``output_text`` just before the second ``' A:'`` marker.

    Text with fewer than two markers is returned unchanged. (Slicing with the
    ``-1`` that ``str.find`` returns on a miss would drop the last character.)
    """
    marker = ' A:'
    first = output_text.find(marker)
    if first == -1:
        return output_text
    second = output_text.find(marker, first + 1)
    if second == -1:
        return output_text
    return output_text[:second]

def generate2(
        model,
        tokenizer,
        tokens=None,
        prompt='',
        embed=None,
        entry_count=1,
        entry_length=67,  # maximum number of generated tokens
        top_p=0.98,
        temperature=1.,
        stop_token = '.',
):
    """Greedily decode a continuation of ``prompt`` (or ``tokens``) after ``embed``.

    Args:
        model: object exposing ``gpt`` (called with ``inputs_embeds=``) and
            ``gpt.transformer.wte``.
        tokenizer: provides ``encode`` and ``decode``.
        tokens: optional ``(1, T)`` tensor of start token ids; when None the
            encoded ``prompt`` is used.
        prompt: text to start from when ``tokens`` is None.
        embed: optional ``(1, P, D)`` prefix embeddings placed before the tokens.
        entry_count: number of entries generated; only the first is returned.
        entry_length: maximum number of tokens generated per entry.
        top_p, temperature: accepted for interface compatibility. The next
            token is always the arg-max of the logits, which neither nucleus
            filtering nor a positive temperature can change, so they are unused.
        stop_token: generation stops after the first token of its encoding.

    Returns:
        decoded text of the first entry, passed through ``filter_ngrams``.
    """
    model.eval()
    generated_list = []
    stop_token_index = tokenizer.encode(stop_token)[0]
    device = next(model.parameters()).device

    with torch.no_grad():
        if tokens is None:
            # An explicit integer dtype keeps an empty prompt usable as embedding input.
            start_tokens = torch.tensor(
                tokenizer.encode(prompt), dtype=torch.int64, device=device
            ).unsqueeze(0)
        else:
            start_tokens = tokens

        for _ in range(entry_count):
            # Every entry starts from the same prompt, not from the previous entry's output.
            entry_tokens = start_tokens
            generated = model.gpt.transformer.wte(entry_tokens)
            if embed is not None:
                generated = torch.cat((embed, generated), dim=1)

            for _ in range(entry_length):
                logits = model.gpt(inputs_embeds=generated).logits[:, -1, :]
                next_token = torch.argmax(logits, -1).unsqueeze(0)
                next_token_embed = model.gpt.transformer.wte(next_token)

                entry_tokens = torch.cat((entry_tokens, next_token), dim=1)
                generated = torch.cat((generated, next_token_embed), dim=1)

                if stop_token_index == next_token.item():
                    break

            # Index the batch dimension instead of squeeze() so a single token stays a list.
            output_text = tokenizer.decode(entry_tokens[0].tolist())
            generated_list.append(filter_ngrams(output_text))

    return generated_list[0]
#from tqdm import tqdm, trange


def _to_caption(pil_image,prompt=''):
    """Generate text for ``pil_image`` with the notebook-level CLIP and caption models.

    Uses the globals ``preprocess``, ``clip_model``, ``model`` and ``tokenizer``.
    ``prompt`` (if any) is the text the generation starts from. Newlines in the
    result are replaced by spaces and non-breaking spaces are removed.
    """
    # Follow the devices the models actually live on instead of assuming 'cuda:0'.
    clip_device = next(clip_model.parameters()).device
    caption_device = next(model.parameters()).device

    image = preprocess(pil_image).unsqueeze(0).to(clip_device)
    with torch.no_grad():
        prefix = clip_model.encode_image(image).to(caption_device, dtype=torch.float32)
        # Use the model's own prefix length so it cannot drift from a notebook global.
        prefix_embed = model.clip_project(prefix).reshape(1, model.prefix_length, -1)
        generated_text_prefix = generate2(model, tokenizer, prompt=prompt or '', embed=prefix_embed)
    return generated_text_prefix.replace('\n',' ').replace('\xa0','')



In [None]:





import datetime

import time
import numpy as np
from PIL import Image
import json
from torchvision.transforms import functional as TF


import pytz
import argparse



from telebot import types
import tg_logger
import logging
import telebot 





import json



import numpy as np

def unique_list(l):
    """Return the items of ``l`` in first-seen order with equal duplicates removed.

    Membership is tested with ``in`` on a list, so unhashable items are supported.
    """
    seen = []
    for item in l:
        if item in seen:
            continue
        seen.append(item)
    return seen


import os

boot_time = time.time()
boot_date = datetime.datetime.now(tz=pytz.timezone("Europe/Moscow"))

# ------------- flask config -------------


# ------------- bot config -------------
# SECURITY: the Telegram tokens used to be hard-coded in this cell. Treat them
# as leaked, revoke them, and supply fresh ones through the environment:
#   TELEGRAM_BOT_TOKEN - token of the bot that answers users
#   TG_LOGGER_TOKEN    - token handed to tg_logger
WEBHOOK_TOKEN = 'aa'
BOT_TOKEN = os.environ['TELEGRAM_BOT_TOKEN']
bot = telebot.TeleBot(BOT_TOKEN)

# ------------- log ---------------
users = ['241154130']

alpha_logger = logging.getLogger()
alpha_logger.setLevel(logging.INFO)
tg_logger.setup(alpha_logger, token=os.environ['TG_LOGGER_TOKEN'], users=users)

logger = logging.getLogger("tg-bot-tti")




import re

from PIL import Image






@bot.message_handler(commands=['help', 'start'])
def say_welcome(message):
    '''Handle /help and /start: log who asked and reply with the greeting text.'''

    # NOTE(review): the `</code>` ... `<code>` pair around the username looks like it is meant
    # to step out of HTML code formatting added by the log handler -- confirm.
    logger.info(f'</code>@{message.from_user.username}<code> ({message.chat.id}) used /start or /help')
    bot.send_message(message.chat.id,
                     """ Text2Image  generate faces here """,
                     parse_mode='html')

#from PIL import Image
#@bot.message_handler(content_types=['video'])
#def get_file(message):
#    file_name = message.json['video']['file_name']
#    file_info = bot.get_file(message.video.file_id)
#    with open(file_name, "wb") as f:
#        file_content = bot.download_file(file_info.file_path)
#        f.write(file_content)
#   bot.reply_to(message, f"OK. Сохранил {file_name}")


@bot.message_handler(content_types=['video'])
def photo(message):
    """Answer the question in a video message's caption.

    The video is downloaded to a temporary file, four frames are sampled into
    a 2x2 grid, and the caption model is prompted with the question. The text
    after the first ``A:`` is sent back; on failure the error text is sent
    instead. The temporary file is always removed.
    """
    import os
    import tempfile

    q = message.caption or ''  # the caption is None when the video has no text

    logger.info(f'{message.from_user.username} {q}')
    path = None
    try:
        file_info = bot.get_file(message.video.file_id)
        file_content = bot.download_file(file_info.file_path)

        # The client-supplied file name is untrusted (and may be absent): keep
        # only its extension and let tempfile choose the location and name.
        client_name = message.json['video'].get('file_name') or ''
        suffix = os.path.splitext(os.path.basename(client_name))[1] or '.mp4'
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
            f.write(file_content)
            path = f.name

        video = read_video(path = path, transform = None,frames_num=4)
        i = image_grid (video,2,2)
        # Same "Q: <question> A: <answer>" layout as the training captions.
        ans = _to_caption(i, prompt=f'Q: {q} A:')

        logger.info(f'{message.from_user.username} {ans}')
        parts = ans.split("A:")
        answer = parts[1].strip() if len(parts) > 1 else ''
        # An empty message text is not worth sending; fall back to the raw output.
        bot.send_message(message.chat.id, answer or ans.strip() or '...')
    except Exception as e:
        logger.exception('failed to answer a video message')
        bot.send_message(message.chat.id, f'Error: {e}')
    finally:
        if path is not None and os.path.exists(path):
            os.remove(path)

        
        
      
    

# Start long polling; the call runs the bot's receive loop in this cell.
if __name__ == '__main__':

  
  bot.polling(none_stop=True)