SociallyIneptWeeb · totoluto · May 10, 2023 · May 10, 2023
diff --git a/.env.sample b/.env.sample
@@ -81,6 +81,10 @@ REQUEST_TIMEOUT=4
 # Use this website to select the correct language code according to ISO 639-1 https://www.andiamo.co.uk/resources/iso-language-codes/
 TARGET_LANGUAGE_CODE=ja
 
+# The language you speak in
+# Use this website to select the correct language code according to ISO 639-1 https://www.andiamo.co.uk/resources/iso-language-codes/
+MAIN_LANGUAGE_CODE=en
+
 # Position of Subtitle, offset from bottom middle of screen
 OFFSET_X=0
 OFFSET_Y=-200

diff --git a/docs/ENV.md b/docs/ENV.md
@@ -6,6 +6,18 @@ First, copy [.env.sample](../.env.sample) to .env by running the following comma
 
 Now open .env in a text editor of your choice and update the variables. Below is a more detailed description of each environment variable
 
+## Language
+
+`MAIN_LANGUAGE_CODE` is the language you speak in. By default, it is set to `en` because most users speak English and therefore want their subtitles to be in English.
+If you want subtitles in a different language, or want to speak your own language instead of English, change the value of `MAIN_LANGUAGE_CODE` to the corresponding code. (Check the [language code](https://www.andiamo.co.uk/resources/iso-language-codes/))
+
+Make sure `TARGET_LANGUAGE_CODE` is set to the language of the input audio. This helps Whisper recognize the language and transcribe it to text.
+By default it is set to `ja`, so Whisper expects Japanese input; if it receives a different language, it will sometimes produce an error.
+
+This way you can watch your favorite K-Drama with your own subtitles.
+It may not work with every language because Whisper is not trained for all languages.
+Check [here](https://help.openai.com/en/articles/7031512-whisper-api-faq) and search for `What languages are supported?`.
+
 ## Logging 
 
 This variable can be set to either _True_ or _False_. Set to _True_ if you would like to see more detailed logging from the terminal when running the python scripts.

diff --git a/src/modules/asr.py b/src/modules/asr.py
@@ -3,26 +3,64 @@
 
 import requests
 from dotenv import load_dotenv
+import deepl
+import googletrans
 
 load_dotenv()
 
 BASE_URL = getenv('WHISPER_BASE_URL')
+MAIN_LANGUAGE = getenv('MAIN_LANGUAGE_CODE')
+USE_DEEPL = getenv('USE_DEEPL', 'False').lower() in ('true', '1', 't')
+DEEPL_AUTH_KEY = getenv('DEEPL_AUTH_KEY')
 REQUEST_TIMEOUT = int(getenv('REQUEST_TIMEOUT'))
 SAMPLE_JP_FILEPATH = Path(__file__).resolve().parent.parent / r'audio\samples\japanese_speech_sample.wav'
 SAMPLE_EN_FILEPATH = Path(__file__).resolve().parent.parent / r'audio\samples\english_speech_sample.wav'
 
 
 def speech_to_text(filepath, task, language):
-    try:
-        with open(filepath, 'rb') as infile:
-            files = {'audio_file': infile}
-            r = requests.post(f'{BASE_URL}/asr?task={task}&language={language}&output=json',
-                              files=files,
-                              timeout=REQUEST_TIMEOUT)
+    # Set DeepL or Google Translator
+    if USE_DEEPL:
+        translator = deepl.Translator(DEEPL_AUTH_KEY)
+    else:
+        translator = googletrans.Translator()
 
-        if r.status_code == 404:
-            print('Unable to reach Whisper, ensure that it is running, or the WHISPER_BASE_URL variable is set correctly')
-            return None
+    try:
+        if MAIN_LANGUAGE == "en" or task == "transcribe":
+            # If the MAIN_LANGUAGE is english whisper is able to do the translating for us
+            # If the task is 'transcribe' we don't need to translate it to the MAIN_LANGUAGE
+            # There are only two use cases for transcribe:
+            # One is the sample test
+            # The other one is used in 'voice_translator.py' and it gets translated to the target language anyway
+            with open(filepath, 'rb') as infile:
+                files = {'audio_file': infile}
+                r = requests.post(f'{BASE_URL}/asr?task={task}&language={language}&output=json',
+                                  files=files,
+                                  timeout=REQUEST_TIMEOUT)
+                if r.status_code == 404:
+                    print(
+                        'Unable to reach Whisper, ensure that it is running, or the WHISPER_BASE_URL variable is set correctly')
+                    return None
+                else:
+                    return r.json()['text'].strip()
+        else:
+            # If MAIN_LANGUAGE isn't english Whisper can't translate the text to the desired Language (Whisper always translates to english)
+            if task == "translate":
+                with open(filepath, 'rb') as infile:
+                    files = {'audio_file': infile}
+                    r = requests.post(f'{BASE_URL}/asr?task={"transcribe"}&language={language}&output=json',
+                                      files=files,
+                                      timeout=REQUEST_TIMEOUT)
+                if r.status_code == 404:
+                    print(
+                        'Unable to reach Whisper, ensure that it is running, or the WHISPER_BASE_URL variable is set correctly')
+                    return None
+                else:
+                    speech = r.json()['text'].strip()
+                    if USE_DEEPL:
+                        translated_speech = translator.translate_text(speech, target_lang=MAIN_LANGUAGE)
+                    else:
+                        translated_speech = translator.translate(speech, dest=MAIN_LANGUAGE).text
+                    return translated_speech
 
     except requests.exceptions.Timeout:
         print('Request timeout')
@@ -32,8 +70,6 @@ def speech_to_text(filepath, task, language):
         print(f'An unknown error has occurred: {e}')
         return None
 
-    return r.json()['text'].strip()
-
 
 if __name__ == '__main__':
     # test if whisper is up and running

diff --git a/src/voice_translator.py b/src/voice_translator.py
@@ -18,6 +18,7 @@
 USE_DEEPL = getenv('USE_DEEPL', 'False').lower() in ('true', '1', 't')
 DEEPL_AUTH_KEY = getenv('DEEPL_AUTH_KEY')
 TARGET_LANGUAGE = getenv('TARGET_LANGUAGE_CODE')
+MAIN_LANGUAGE = getenv('MAIN_LANGUAGE_CODE')
 MIC_ID = int(getenv('MICROPHONE_ID'))
 RECORD_KEY = getenv('MIC_RECORD_KEY')
 LOGGING = getenv('LOGGING', 'False').lower() in ('true', '1', 't')
@@ -61,20 +62,20 @@ def on_release_key(_):
 
     # transcribe audio
     try:
-        eng_speech = speech_to_text(MIC_AUDIO_PATH, 'transcribe', 'en')
+        speech = speech_to_text(MIC_AUDIO_PATH, 'transcribe', MAIN_LANGUAGE)
     except requests.exceptions.JSONDecodeError:
         print('Too many requests to process at once')
         return
 
-    if eng_speech:
+    if speech:
 
         if USE_DEEPL:
-            translated_speech = translator.translate_text(eng_speech, target_lang=TARGET_LANGUAGE)
+            translated_speech = translator.translate_text(speech, target_lang=TARGET_LANGUAGE)
         else:
-            translated_speech = translator.translate(eng_speech, dest=TARGET_LANGUAGE).text
+            translated_speech = translator.translate(speech, dest=TARGET_LANGUAGE).text
 
         if LOGGING:
-            print(f'English: {eng_speech}')
+            print(f'Said: {speech}')
             print(f'Translated: {translated_speech}')
 
         speak(translated_speech, TARGET_LANGUAGE)