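"""Gradio Space for sales-video quality review.

Uploaded audio/video is converted to WAV with ffmpeg, transcribed with
Whisper large-v3, speaker-diarized with pyannote 3.1, and the resulting
transcript is then scored against a Volkswagen service-video checklist by
ChatGLM3-6B-32k.
"""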
import base64
import logging
import os
import shutil
import time
from typing import Literal, Optional

import gradio as gr
import torch
# from omegaconf import OmegaConf
# import wget
from diarization_utils import diarize
from huggingface_hub import HfApi
from pyannote.audio import Pipeline
from pydantic import BaseModel, ValidationError
from pydantic_settings import BaseSettings
from starlette.exceptions import HTTPException
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, pipeline
# from config import model_settings, InferenceConfig

# Module-level state shared between the Gradio callbacks.
variable = []   # paths of the most recently uploaded files
speech = ""     # diarized transcript produced by audio_function()
# context_2 = ""

logger = logging.getLogger(__name__)
class ModelSettings(BaseSettings):
    asr_model: str
    assistant_model: Optional[str]
    diarization_model: Optional[str]
    hf_token: Optional[str]


class InferenceConfig(BaseModel):
    task: Literal["transcribe", "translate"] = "transcribe"
    batch_size: int = 24
    assisted: bool = False
    chunk_length_s: int = 30
    sampling_rate: int = 16000
    language: Optional[str] = None
    num_speakers: Optional[int] = None
    min_speakers: Optional[int] = None
    max_speakers: Optional[int] = None
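# Illustrative only: the defaults above can be overridden per request, e.g.
#   InferenceConfig(task="translate", language="en", num_speakers=2)
# audio_function() below simply uses InferenceConfig() with its defaults.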
# from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR
# from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps

# Run on GPU in half precision when available, otherwise on CPU in float32.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# logger.info(f"Using device: {device.type}")
torch_dtype = torch.float32 if device.type == "cpu" else torch.float16

# ChatGLM3 (32k context) is used later to answer the checklist questions about the transcript.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b-32k", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm3-6b-32k", trust_remote_code=True, device_map="auto")
# base_model = "lyogavin/Anima-7B-100K"
# tokenizer = AutoTokenizer.from_pretrained(base_model)
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     bnb_4bit_compute_dtype=torch.float16,
#     # torch_dtype=torch.float16,
#     trust_remote_code=True,
#     device_map="auto",
#     load_in_4bit=True,
# )
# model.eval()
# Distil-Whisper loaded as an assistant (draft) model for assisted generation;
# it is only used when InferenceConfig.assisted is True.
assistant_model = AutoModelForCausalLM.from_pretrained(
    "distil-whisper/distil-large-v3",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
assistant_model.to(device)

# Main Whisper ASR pipeline.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=torch_dtype,
    device=device,
)

# Validate the HF_TOKEN secret; the token also needs access to the gated
# pyannote speaker-diarization model.
HfApi().whoami(os.getenv("HF_TOKEN"))
diarization_pipeline = Pipeline.from_pretrained(
    checkpoint_path="pyannote/speaker-diarization-3.1",
    use_auth_token=os.getenv("HF_TOKEN"),
)
diarization_pipeline.to(device)
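# Quick sanity check of the ASR pipeline (illustrative, not executed here;
# "sample.wav" is a placeholder path):
#   print(asr_pipeline("sample.wav", return_timestamps=True)["text"])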
def upload_file(files):
    """Store the temp paths of the uploaded files and echo them back to the UI."""
    file_paths = [file.name for file in files]
    global variable
    variable = file_paths
    return file_paths
def audio_function():
    """Convert the uploaded file to WAV, run ASR plus speaker diarization, and
    return the diarized transcript, the timestamped chunks and the raw text."""
    time_1 = time.time()
    paths = variable
    # Note: the concatenation below only yields a valid path when a single file was uploaded.
    input_path = "".join(paths)
    print("before processing ffmpeg!")
    # Convert whatever was uploaded (mp3/mp4/...) to a WAV file for the ASR pipeline.
    # The path is quoted so spaces do not break the command.
    command_to_mp4_to_wav = 'ffmpeg -i "{}" current_out.wav -y'
    # -acodec pcm_s16le -ar 16000 -ac 1
    os.system(command_to_mp4_to_wav.format(input_path))
    print("after ffmpeg")
    # os.system("insanely-fast-whisper --file-name {}_new.wav --task transcribe --hf_token $HF_TOKEN".format(input_path.replace("mp3", "")))
    parameters = InferenceConfig()
    generate_kwargs = {
        "task": parameters.task,
        "language": parameters.language,
        "assistant_model": assistant_model if parameters.assisted else None,
    }
    # Read the converted audio as raw bytes for the pipeline.
    with open("current_out.wav", "rb") as f:
        file = f.read()
    asr_outputs = asr_pipeline(
        file,
        chunk_length_s=parameters.chunk_length_s,
        batch_size=parameters.batch_size,
        generate_kwargs=generate_kwargs,
        return_timestamps=True,
    )
    transcript = diarize(diarization_pipeline, file, parameters, asr_outputs)
    global speech
    speech = transcript
    return transcript, asr_outputs["chunks"], asr_outputs["text"]
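# Note: audio_function() stashes the diarized transcript in the module-level
# `speech` variable and audio_function2() reads it back, so "Process Audio"
# must be run before "Summarize".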
def audio_function2():
    """Ask ChatGLM3 to answer the quality-checklist questions about the
    transcript produced by audio_function(), and report the time taken."""
    # global speech
    str2 = speech
    time_3 = time.time()
    # prompt = " {} generate medical subjective objective assessment plan (soap) notes ?".format(str2)
| prompt = """ {} "Did the technician introduce themselves at the start of the video?" | |
| "Did the technician mention their level of experience during the video?" | |
| "Did the technician use the customer's name during the introduction?" | |
| "Did the technician mention the name of the Customer Advisor managing the booking?" | |
| "Did the technician provide a personal recommendation statement in the video?" | |
| "Did the technician mention service plans available to the customer?" | |
| "Did the technician mention genuine Volkswagen parts during the video?" | |
| "Did the technician mention the national parts and labor warranty?" | |
| "Did the technician mention the 7-day price promise during the video?" | |
| "Did the technician thank the customer for choosing Parkway Volkswagen?" | |
| "Did the technician provide a clear NANO statement at the end of the video?" | |
| "Does the video show the vehicle staged on a raised ramp?" | |
| "Does the video show the area around the vehicle clean and organized?" | |
| "Does the video show the vehicle’s bonnet open and upright?" | |
| "Does the technician wear gloves during the video?" | |
| "Does the video show protective items (e.g., seat covers, mats) being used on the vehicle?" | |
| "Does the video show suitable props like a pointer or tire depth gauge being used?" | |
| "Does the video show the technician starting at the nearest point of reference on the vehicle?" | |
| "Does the video demonstrate the use of the Augmented Reality (AR) function?" | |
| "Did the technician verbally explain the condition of at least two items?" / "Does the video show evidence of at least two items (e.g., tires, brakes) being inspected?" | |
| "Did the technician explain the percentage wear of tire treads or brake pads?" / "Does the video show measurement of tire treads or brake pads?" | |
| "Does the video show the technician removing a wheel to demonstrate brake condition clearly?" | |
| "Did the technician provide additional context regarding brake or tire wear?" / "Does the video visually demonstrate brake or tire wear with context?" | |
| "Did the technician explain the consequences of any identified repair areas?" / "Does the video show repair areas or consequences visually?" | |
| "Did the technician verbally compare a new part to a worn part?" / "Does the video show a side-by-side comparison of a new part and a worn part?" | |
| "Does the video include or reference supporting documents (e.g., photographs of identified items)?" """.format(str2) | |
    # model = model.eval()
    # ChatGLM3's chat() returns the model reply plus the running chat history.
    response, history = model.chat(tokenizer, prompt, history=[])
    print(response)
    # del model
    # del tokenizer
    # torch.cuda.empty_cache()
    time_4 = time.time()
    # response, history = model.chat(tokenizer, "What should I do if I can't sleep at night?", history=history)
    # print(response)
    # inputs = tokenizer(prompt, return_tensors="pt")
    # inputs['input_ids'] = inputs['input_ids'].cuda()
    # inputs['attention_mask'] = inputs['attention_mask'].cuda()
    # generate_ids = model.generate(**inputs, max_new_tokens=4096,
    #                               only_last_logit=True,  # to save memory
    #                               use_cache=False,  # enable this to save memory when running into OOM
    #                               xentropy=True)
    # output = tokenizer.batch_decode(generate_ids,
    #                                 skip_special_tokens=True,
    #                                 clean_up_tokenization_spaces=False)
    # tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
    # model = AutoModelForCausalLM.from_pretrained("togethercomputer/LLaMA-2-7B-32K", trust_remote_code=True, torch_dtype=torch.float16, device_map="auto", bnb_4bit_compute_dtype=torch.float16, load_in_4bit=True)
    # input_context = "summarize " + " the following {}".format(str2)
    # input_ids = tokenizer.encode(input_context, return_tensors="pt").cuda()
    # output = model.generate(input_ids, max_new_tokens=512, temperature=0.7)
    # output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # print(output_text, "wow what happened ")
    # return output
    return response, str(int(time_4 - time_3)) + " seconds"
with gr.Blocks() as demo:
    file_output = gr.File()
    upload_button = gr.UploadButton(
        "Click to Upload a File", file_types=["audio", "video"], file_count="multiple"
    )
    upload_button.upload(upload_file, upload_button, file_output)

    gr.Markdown("## Click Process Audio to display the text from the audio file")
    process_button = gr.Button("Process Audio")
    diarization_text = gr.Textbox(label="Speech Diarization")
    chunks_text = gr.Textbox(label="Speech chunks")
    asr_text = gr.Textbox(label="ASR text")
    process_button.click(audio_function, outputs=[diarization_text, chunks_text, asr_text])

    gr.Markdown("## Click Summarize to display the call summary")
    summarize_button = gr.Button("Summarize")
    notes_text = gr.Textbox(label="Sales Call Notes")
    time_text = gr.Textbox(label="Time Taken")
    summarize_button.click(audio_function2, outputs=[notes_text, time_text])

demo.launch()