Skip to content

Commit

Permalink
refactor: simplify prompt reasoning chain due to minimal improvement
Browse files (browse the repository at this point in the history)
  • Loading branch information
Huanshere committed Nov 11, 2024
1 parent b5b3a5c commit 158d3f4
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 42 deletions.
42 changes: 9 additions & 33 deletions core/prompts_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
## ================================================================
# @ step4_splitbymeaning.py
def get_split_prompt(sentence, num_parts = 2, word_limit = 20):
# ! only support num_parts = 2
language = load_key("whisper.detected_language")
split_prompt = f"""
### Role
Expand All @@ -15,29 +14,19 @@ def get_split_prompt(sentence, num_parts = 2, word_limit = 20):
Your task is to split the given subtitle text into **{num_parts}** parts, each should be less than {word_limit} words.
### Requirements
1. Try to maintain the coherence of the sentence meaning, split according to Netflix subtitle standards, ensuring the two parts are relatively independent.
1. Try to maintain the coherence of the sentence meaning, split according to Netflix subtitle standards, ensuring the parts are relatively independent.
2. The length of each part should be roughly equal, no part should be less than 3 words, but the integrity of the sentence is more important.
3. Prioritize splitting at punctuation marks, such as periods, commas, and conjunctions (e.g., "and", "but", "because", "when", "then", "if", "so", "that").
### Steps
1. Analyze the grammar and structure of the given text.
2. Provide 2 different ways to split the text, each with different split points, output complete sentences (do not change any letters or punctuation), insert [br] tags at the split positions.
3. Briefly compare and evaluate the above 2 split methods, considering readability, grammatical structure, and contextual coherence, choose the best split method.
4. Give the best split method number, 1 or 2.
### Output Format
Please provide your answer in the following JSON format, <<>> represents placeholders:
Please provide your answer in the following JSON format:
{{
"analysis": "Brief analysis of the text structure and split strategy",
"split_1": "<<The first split method, output complete sentences, insert [br] as a delimiter at the split position. e.g. this is the first part [br] this is the second part.>>",
"split_2": "<<The second split method>>",
"eval": "<<Unified brief evaluation of the 2 split methods, written in one sentence, no line breaks>>",
"best": "<<The best split method number, 1 or 2>>"
"analysis": "<<Brief analysis of the text structure and split strategy>>",
"split": "<<Output complete sentences, insert [br] as a delimiter at the split position. e.g. this is the first part [br] this is the second part.>>"
}}
### Given Text
<split_this_sentence>\n{sentence}\n</split_this_sentence>
""".strip()

return split_prompt
Expand Down Expand Up @@ -256,9 +245,8 @@ def get_align_prompt(src_sub, tr_sub, src_part):
### Task Description
Based on the provided original {src_language} and {target_language} original subtitles, as well as the pre-processed split version, you need to:
1. Analyze the word order and structural correspondence between {src_language} and {target_language} subtitles
2. Provide 2 different splitting schemes for the {target_language} subtitles
3. Evaluate these schemes and select the best one
4. Never leave empty lines. If it's difficult to split based on meaning, you may appropriately rewrite the sentences that need to be aligned
2. Split the {target_language} subtitles according to the pre-processed {src_language} split version
3. Never leave empty lines. If it's difficult to split based on meaning, you may appropriately rewrite the sentences that need to be aligned
### Subtitle Data
<subtitles>
Expand All @@ -267,25 +255,13 @@ def get_align_prompt(src_sub, tr_sub, src_part):
Pre-processed {src_language} Subtitles ([br] indicates split points): {src_part}
</subtitles>
### Processing Steps
Please follow these steps and provide the results for each step in the JSON output:
1. Analysis and Comparison: Briefly analyze the word order, sentence structure, and semantic correspondence between {src_language} and {target_language} subtitles. Point out key word correspondences, similarities and differences in sentence patterns, and language features that may affect splitting.
2. Start Alignment: Based on your analysis, provide 2 different alignment methods for {target_language} subtitles according to the format. The split positions in {src_language} must be consistent with the pre-processed {src_language} split version and cannot be changed arbitrarily.
3. Evaluation and Selection: Examine and briefly evaluate the 2 schemes, considering factors such as sentence completeness, semantic coherence, and appropriateness of split points.
4. Best Scheme: Select the best alignment scheme, output only a single number, 1 or 2.
### Output Format
Please complete the following JSON data, where << >> represents placeholders, and return your results in JSON format:
{{
"analysis": "<<Detailed analysis of word order, structure, and semantic correspondence between {src_language} and {target_language} subtitles>>",
"align_1": [
{align_parts_json}
],
"align_2": [
"analysis": "<<Brief analysis of word order, structure, and semantic correspondence between {src_language} and {target_language} subtitles>>",
"align": [
{align_parts_json}
],
"comparison": "<<Brief evaluation and comparison of the 2 alignment schemes>>",
"best": "<<Number of the best alignment scheme, 1 or 2>>"
]
}}
'''

Expand Down
7 changes: 4 additions & 3 deletions core/step3_2_splitbymeaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,12 @@ def split_sentence(sentence, num_parts, word_limit=18, index=-1, retry_attempt=0
"""Split a long sentence using GPT and return the result as a string."""
split_prompt = get_split_prompt(sentence, num_parts, word_limit)
def valid_split(response_data):
if 'best' not in response_data:
return {"status": "error", "message": "Missing required key: `best`"}
if 'split' not in response_data:
return {"status": "error", "message": "Missing required key: `split`"}
return {"status": "success", "message": "Split completed"}

response_data = ask_gpt(split_prompt + ' ' * retry_attempt, response_json=True, valid_def=valid_split, log_title='sentence_splitbymeaning')
best_split = response_data[f"split_{response_data['best']}"]
best_split = response_data["split"]
split_points = find_split_positions(sentence, best_split)
# split the sentence based on the split points
for i, split_point in enumerate(split_points):
Expand Down
10 changes: 4 additions & 6 deletions core/step5_splitforsub.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,13 @@ def align_subs(src_sub: str, tr_sub: str, src_part: str) -> Tuple[List[str], Lis
align_prompt = get_align_prompt(src_sub, tr_sub, src_part)

def valid_align(response_data):
# check if the best is in the response_data
if 'best' not in response_data:
return {"status": "error", "message": "Missing required key: `best`"}
if 'align' not in response_data:
return {"status": "error", "message": "Missing required key: `align`"}
return {"status": "success", "message": "Align completed"}
parsed = ask_gpt(align_prompt, response_json=True, valid_def=valid_align, log_title='align_subs')

best = int(parsed['best'])
align_data = parsed[f'align_{best}']
parsed = ask_gpt(align_prompt, response_json=True, valid_def=valid_align, log_title='align_subs')

align_data = parsed['align']
src_parts = src_part.split('\n')
tr_parts = [item[f'target_part_{i+1}'].strip() for i, item in enumerate(align_data)]

Expand Down

0 comments on commit 158d3f4

Please sign in to comment.