diff --git a/content/post/20230303_transcribe/index.md b/content/post/20230303_transcribe/index.md new file mode 100644 index 0000000..2647f72 --- /dev/null +++ b/content/post/20230303_transcribe/index.md @@ -0,0 +1,80 @@ ++++ +title = "Transcribing Videos with OpenAI's Whisper and ChatGPT APIs" +date = 2023-03-03T00:00:00 +lastmod = 2023-03-03T00:00:00 +draft = true + +# Authors. Comma separated list, e.g. `["Bob Smith", "David Jones"]`. +authors = ["Carl Pearson"] + +tags = [] + +summary = "" + +# Projects (optional). +# Associate this post with one or more of your projects. +# Simply enter your project's folder or file name without extension. +# E.g. `projects = ["deep-learning"]` references +# `content/project/deep-learning/index.md`. +# Otherwise, set `projects = []`. +projects = [] + +# Featured image
# To use, add an image named `featured.jpg/png` to your project's folder. +[image] + # Caption (optional) + caption = "" + + # Focal point (optional) + # Options: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight + focal_point = "Center" + + # Show image only in page previews? + preview_only = false + + +categories = [] + +# Set captions for image gallery. + + ++++ + +At $0.006/min and $0.002/1k tokens, OpenAI's Whisper and ChatGPT APIs are cheap enough to play with. +Let's do some back-of-the-envelope calculations about a hypothetical transcription system. +It has two pieces - OpenAI's Whisper for the speech-to-text, and then OpenAI's ChatGPT to clean up any transcription errors and break the text into paragraphs. + +First, some back-of-the-envelope calculations. +A fast English speaker reaches around 160 words per minute. +OpenAI says each token is about 0.75 words for standard English, meaning our hypothetical fast, non-stop speaker is generating about 213 tokens per minute, or roughly 12,800 per hour.
+If we had to pass those through ChatGPT (one token out for each token in), we would get the following costs: + +| API | Cost | Cost for 1 hour of speech | +|---------|--------------------|---------------------------| +| Whisper | $0.006 / min | 36 cents | +| ChatGPT | $0.002 / 1k tokens | 5.12 cents | + + +ChatGPT is basically free - Whisper is about 7x as expensive -- but the whole thing still comes out to less than $0.50 to transcribe an hour of speech. + +## High-Level Design + +YouTube -> file.webm -> Whisper -> file-1.txt...file-N.txt -> ChatGPT -> clean-1.txt...clean-N.txt -> transcript + +## Design Considerations + + + +The most pressing limit is ChatGPT's context limit: around 4k tokens. +For our purposes, we expect to generate slightly more than one output token for each input token, since ChatGPT will be asked to reproduce the input text with added paragraph breaks. +This means our input is limited to around 2000 tokens per API call. +At 213 tokens per minute, we'd expect to reach that limit after about 9 minutes. +This means we need to split the input audio into chunks no longer than 9 minutes each. + + + +If we're not smart about that splitting, we might end up cutting a word in half, which will limit the accuracy of the Whisper API transcription on those words. +We'd rather make shorter chunks that are split when there is silence in the video. +From a monetary cost perspective, it actually doesn't matter how short the chunks are -- OpenAI is billing us for each second of audio and for each word processed by ChatGPT. +Regardless of how short the chunks are, the total audio length and words processed by ChatGPT are the same.
+ diff --git a/content/publication/20230302_pearson_arxiv/index.md b/content/publication/20230302_pearson_arxiv/index.md new file mode 100644 index 0000000..f85301f --- /dev/null +++ b/content/publication/20230302_pearson_arxiv/index.md @@ -0,0 +1,22 @@ ++++ +title = "[arXiv] Interconnect Bandwidth Heterogeneity on AMD MI250x and Infinity Fabric" +date = 2023-02-28T00:00:00 # Schedule page publish date. +draft = false + +math = false + +tags = ["ROCm"] ++++ + +**Carl Pearson** + + Demand for low-latency and high-bandwidth data transfer between GPUs has +driven the development of multi-GPU nodes. Physical constraints on the +manufacture and integration of such systems has yielded heterogeneous +intra-node interconnects, where not all devices are connected equally. The next +generation of supercomputing platforms are expected to feature AMD CPUs and +GPUs. This work characterizes the extent to which interconnect heterogeneity is +visible through GPU programming APIs on a system with four AMD MI250x GPUs, and +provides several insights for users of such systems. + +* [arxiv](https://arxiv.org/abs/2302.14827)