diff --git a/README.md b/README.md
index be6eb7c..62a81f9 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,7 @@ The following arguments are available:
How the extractions are performed
-* `--use_extractions` (REQUIRED): if you only want to use certain extraction types, you can pass their slug found in either `includes/ai/config.yaml`, `includes/lookup/config.yaml` `includes/pattern/config.yaml` (e.g. `pattern_ipv4_address_only`). Default if not passed, no extractions applied. You can also pass a catch all wildcard `*` which will match all extraction paths (e.g. `pattern_*` would run all extractions starting with `pattern_`)
+* `--use_extractions` (REQUIRED): if you only want to use certain extraction types, you can pass their slug found in either `includes/ai/config.yaml`, `includes/lookup/config.yaml` `includes/pattern/config.yaml` (e.g. `pattern_ipv4_address_only`). Default if not passed, no extractions applied. You can also pass a catch all wildcard `*` which will match all extraction paths (e.g. `'pattern_*'` would run all extractions starting with `pattern_` -- make sure to use quotes when using a wildcard)
* Important: if using any AI extractions (`ai_*`), you must set an AI API key in your `.env` file
* Important: if you are using any MITRE ATT&CK, CAPEC, CWE, ATLAS or Location extractions you must set `CTIBUTLER` or NVD CPE or CVE extractions you must set `VULMATCH` settings in your `.env` file
* `--relationship_mode` (REQUIRED): either.
@@ -110,11 +110,13 @@ If any AI extractions, or AI relationship mode is set, you must set the followin
* Provider (env var required `OPENAI_API_KEY`): `openai:`, models e.g.: `gpt-4o`, `gpt-4o-mini`, `gpt-4-turbo`, `gpt-4` ([More here](https://platform.openai.com/docs/models))
* Provider (env var required `ANTHROPIC_API_KEY`): `anthropic:`, models e.g.: `claude-3-5-sonnet-latest`, `claude-3-5-haiku-latest`, `claude-3-opus-latest` ([More here](https://docs.anthropic.com/en/docs/about-claude/models))
* Provider (env var required `GOOGLE_API_KEY`): `gemini:models/`, models: `gemini-1.5-pro-latest`, `gemini-1.5-flash-latest` ([More here](https://ai.google.dev/gemini-api/docs/models/gemini))
- * Provider (env var required `DEEPSEEK_API_KEY`): `deepseek:`, models `deepseek-chat` ([More here](https://api-docs.deepseek.com/quick_start/pricing))
+ * Provider (env var required `DEEPSEEK_API_KEY`): `deepseek:`, models `deepseek-chat` ([More here](https://api-docs.deepseek.com/quick_start/pricing))
* See `tests/manual-tests/cases-ai-extraction-type.md` for some examples
* `--ai_settings_relationships`:
* similar to `ai_settings_extractions` but defines the model used to generate relationships. Only one model can be provided. Passed in same format as `ai_settings_extractions`
* See `tests/manual-tests/cases-ai-relationships.md` for some examples
+* `--ai_content_check`: Passing this flag will get the AI to try and classify the text in the input to 1) determine if it is talking about threat intelligence, and 2) what type of threat intelligence it is talking about. For context, we use this to filter out non-threat intel posts in Obstracts and Stixify. You pass `provider:model` with this flag to determine the AI model you wish to use to perform the check.
+* `--ai_create_attack_flow`: passing this flag will also prompt the AI model (the same entered for `--ai_settings_relationships`) to generate an [Attack Flow](https://center-for-threat-informed-defense.github.io/attack-flow/) for the MITRE ATT&CK extractions to define the logical order in which they are being described. You must pass `--ai_settings_relationships` for this to work.
## Adding new extractions
diff --git a/includes/extractions/ai/config.yaml b/includes/extractions/ai/config.yaml
index d2a4c3d..013c7ea 100644
--- a/includes/extractions/ai/config.yaml
+++ b/includes/extractions/ai/config.yaml
@@ -725,7 +725,7 @@ ai_mitre_attack_enterprise:
version: 1.0.0
prompt_base: 'Extract all references to MITRE ATT&CK Enterprise tactics, techniques, groups, data sources, mitigations, software, and campaigns described in the text. These references may not be explicit in the text so you should be careful to account for the natural language of the text your analysis. Do not include MITRE ATT&CK ICS or MITRE ATT&CK Mobile in the results.'
prompt_helper: 'If you are unsure, you can learn more about MITRE ATT&CK Enterprise here: https://attack.mitre.org/matrices/enterprise/'
- prompt_conversion: 'Convert all extractions into the corresponding ATT&CK ID.'
+ prompt_conversion: 'You should respond with only the ATT&CK ID.'
test_cases: ai_mitre_attack_enterprise
stix_mapping: ctibutler-mitre-attack-enterprise-id
@@ -740,7 +740,7 @@ ai_mitre_attack_mobile:
version: 1.0.0
prompt_base: 'Extract all references to MITRE ATT&CK Mobile tactics, techniques, groups, data sources, mitigations, software, and campaigns described in the text. These references may not be explicit in the text so you should be careful to account for the natural language of the text your analysis. Do not include MITRE ATT&CK ICS or MITRE ATT&CK Enterprise in the results.'
prompt_helper: 'If you are unsure, you can learn more about MITRE ATT&CK Enterprise here: https://attack.mitre.org/matrices/mobile/'
- prompt_conversion: 'Convert all extractions into the corresponding ATT&CK ID.'
+ prompt_conversion: 'You should respond with only the ATT&CK ID.'
test_cases: ai_mitre_attack_mobile
stix_mapping: ctibutler-mitre-attack-mobile-id
@@ -755,7 +755,7 @@ ai_mitre_attack_ics:
version: 1.0.0
prompt_base: 'Extract all references to MITRE ATT&CK ICS tactics, techniques, groups, data sources, mitigations, software, and campaigns described in the text. These references may not be explicit in the text so you should be careful to account for the natural language of the text your analysis. Do not include MITRE ATT&CK Mobile or MITRE ATT&CK Enterprise in the results.'
prompt_helper: 'If you are unsure, you can learn more about MITRE ATT&CK Enterprise here: https://attack.mitre.org/matrices/ics/'
- prompt_conversion: 'Convert all extractions into the corresponding ATT&CK ID.'
+ prompt_conversion: 'You should respond with only the ATT&CK ID.'
test_cases: ai_mitre_attack_ics
stix_mapping: ctibutler-mitre-attack-ics-id
@@ -772,7 +772,7 @@ ai_mitre_capec:
version: 1.0.0
prompt_base: 'Extract all references to a MITRE CAPEC object from the text.'
prompt_helper: 'If you are unsure, you can learn more about MITRE CAPEC here: https://capec.mitre.org/'
- prompt_conversion: 'Convert all extractions into the corresponding CAPEC ID in the format `CAPEC-ID`'
+ prompt_conversion: 'You should respond with only the CAPEC ID.'
test_cases: ai_mitre_capec
stix_mapping: ctibutler-mitre-capec-id
@@ -789,7 +789,7 @@ ai_mitre_cwe:
version: 1.0.0
prompt_base: 'Extract all references to a MITRE CWE object from the text.'
prompt_helper: 'If you are unsure, you can learn more about MITRE CAPEC here: https://cwe.mitre.org/'
- prompt_conversion: 'Convert all extractions into the corresponding CWE ID in the format `CWE-ID`'
+ prompt_conversion: 'You should respond with only the CWE ID.'
test_cases: ai_mitre_cwe
stix_mapping: ctibutler-mitre-cwe-id
diff --git a/includes/tests/test_cases.yaml b/includes/tests/test_cases.yaml
index 2650dde..bd08712 100644
--- a/includes/tests/test_cases.yaml
+++ b/includes/tests/test_cases.yaml
@@ -492,8 +492,8 @@ ai_mitre_attack_enterprise:
- 'T1053.005' # attack-pattern--005a06c6-14bf-4118-afa0-ebcd8aebb0c9
- 'T1040' # attack-pattern--3257eb21-f9a7-4430-8de1-d8b6e288f529 , course-of-action--46b7ef91-4e1d-43c5-a2eb-00fa9444f6f4
- 'TA0003' # x-mitre-tactic--5bc1d813-693e-4823-9961-abf9af4b0e92
- - 'Rundll32' # attack-pattern--045d0922-2310-4e60-b5e4-3302302cb3c5
- - 'OS Credential Dumping' # attack-pattern--0a3ead4e-6d47-4ccb-854c-a6a4f9d96b22
+ # hidden as causes ai to get confused - 'Rundll32' # attack-pattern--045d0922-2310-4e60-b5e4-3302302cb3c5
+ # hidden as causes ai to get confused - 'OS Credential Dumping' # attack-pattern--0a3ead4e-6d47-4ccb-854c-a6a4f9d96b22
test_negative_examples:
-
@@ -520,8 +520,8 @@ ai_mitre_attack_mobile:
- 'S0505' # malware--3271c107-92c4-442e-9506-e76d62230ee8
- 'T1630.001' # attack-pattern--0cdd66ad-26ac-4338-a764-4972a1e17ee3
- 'TA0029' # x-mitre-tactic--3e962de5-3280-43b7-bc10-334fbc1d6fa8
- - 'Impair Defenses' # attack-pattern--20b0931a-8952-42ca-975f-775bad295f1a
- - 'Call Log' # attack-pattern--1d1b1558-c833-482e-aabb-d07ef6eae63d
+ # hidden as causes ai to get confused - 'Impair Defenses' # attack-pattern--20b0931a-8952-42ca-975f-775bad295f1a
+ # hidden as causes ai to get confused - 'Call Log' # attack-pattern--1d1b1558-c833-482e-aabb-d07ef6eae63d
test_negative_examples:
-
@@ -541,8 +541,8 @@ generic_mitre_attack_ics_name:
ai_mitre_attack_ics:
test_positive_examples:
- 'TA0111' # x-mitre-tactic--33752ae7-f875-4f43-bdb6-d8d02d341046
- - 'Scripting' # attack-pattern--2dc2b567-8821-49f9-9045-8740f3d0b958
- - 'Program Upload' # attack-pattern--3067b85e-271e-4bc5-81ad-ab1a81d411e3
+ # hidden as causes ai to get confused - 'Scripting' # attack-pattern--2dc2b567-8821-49f9-9045-8740f3d0b958
+ # hidden as causes ai to get confused - 'Program Upload' # attack-pattern--3067b85e-271e-4bc5-81ad-ab1a81d411e3
test_negative_examples:
####### MITRE CAPEC #######
@@ -567,8 +567,8 @@ generic_mitre_capec_name:
ai_mitre_capec:
test_positive_examples:
- 'CAPEC-110' # attack-pattern--7c90bef7-530c-427b-8fb7-f9d3eda9c26a
- - 'Clickjacking' # attack-pattern--ec41b2b3-a3b6-4af0-be65-69e82907dfef
- - 'Overflow Buffers' # attack-pattern--77e51461-7843-411c-a90e-852498957f76
+ # hidden as causes ai to get confused - 'Clickjacking' # attack-pattern--ec41b2b3-a3b6-4af0-be65-69e82907dfef
+ # hidden as causes ai to get confused - 'Overflow Buffers' # attack-pattern--77e51461-7843-411c-a90e-852498957f76
test_negative_examples:
-
@@ -596,8 +596,8 @@ ai_mitre_cwe:
test_positive_examples:
- 'CWE-1023' # weakness--c122031a-5735-54f2-a80b-194da3a2c0e6
- 'CWE-102' # weakness--ad5b3e38-fdf2-5c97-90da-30dad0f1f016
- - 'Use of Redundant Code' # weakness--6dfb4e56-706d-5243-a3eb-6d4e49b16389
- - 'Insufficient Encapsulation' # weakness--b0a3b7a9-fefa-5435-8336-4d2e019597f8
+ # hidden as causes ai to get confused - 'Use of Redundant Code' # weakness--6dfb4e56-706d-5243-a3eb-6d4e49b16389
+ # hidden as causes ai to get confused - 'Insufficient Encapsulation' # weakness--b0a3b7a9-fefa-5435-8336-4d2e019597f8
test_negative_examples:
####### MITRE ATLAS #######
diff --git a/pyproject.toml b/pyproject.toml
index 996fa82..8b49802 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "txt2stix"
-version = "0.0.1b5"
+version = "0.0.2"
authors = [{ name = "DOGESEC", email = "support@dogesec.com" }]
description = "txt2stix is a Python script that is designed to identify and extract IoCs and TTPs from text files, identify the relationships between them, convert them to STIX 2.1 objects, and output as a STIX 2.1 bundle."
readme = "README.md"
@@ -23,7 +23,7 @@ dependencies = [
"requests==2.32.3",
"python-dotenv>=1.0.1",
"schwifty>=2024.6.1",
- "stix2extensions @ https://github.com/muchdogesec/stix2extensions/archive/main.zip",
+ "stix2extensions @ https://github.com/muchdogesec/stix2extensions/releases/download/main-2025-02-12-06-23-37/stix2extensions-0.0.3-py3-none-any.whl",
"tld>=0.13",
"tldextract>=5.1.2",
"validators>=0.28.3",
diff --git a/requirements.txt b/requirements.txt
index 6401f79..eb18120 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -51,7 +51,6 @@ sniffio==1.3.1; python_version >= '3.7'
sqlalchemy==2.0.30; python_version >= '3.7'
stix2==3.0.1; python_version >= '3.6'
stix2-patterns==2.0.0; python_version >= '3.6'
-https://github.com/muchdogesec/stix2extensions/archive/main.zip
tenacity==8.3.0; python_version >= '3.8'
tiktoken==0.7.0; python_version >= '3.8'
tld==0.13; python_version >= '3.7' and python_version < '4'
@@ -63,4 +62,5 @@ validators==0.28.3; python_version >= '3.8'
yarl==1.9.4; python_version >= '3.7'
zipp==3.19.1; python_version >= '3.8'
llama-index==0.10.51; python_version >= '3.8'
-base58>=2.1.1; python_version >= '3.8'
\ No newline at end of file
+base58>=2.1.1; python_version >= '3.8'
+stix2extensions @ https://github.com/muchdogesec/stix2extensions/releases/download/main-2025-02-12-06-23-37/stix2extensions-0.0.3-py3-none-any.whl
\ No newline at end of file
diff --git a/tests/data/manually_generated_reports/attack_flow_demo.txt b/tests/data/manually_generated_reports/attack_flow_demo.txt
new file mode 100644
index 0000000..73e90a0
--- /dev/null
+++ b/tests/data/manually_generated_reports/attack_flow_demo.txt
@@ -0,0 +1,7 @@
+Victims receive spear phishing emails with from test@test.com malicious zip files attached named badfile.zip
+
+Due to password protection, the zip files are able to bypass some AV detections.
+
+The zip files are extracted and usually contain a malicious document, such as a .doc, .pdf, or .xls. Some examples are malware.pdf and bad.com
+
+The extracted files contain malicious macros that connect to a C2 server 1.1.1.1
\ No newline at end of file
diff --git a/tests/manual-tests/cases-standard-tests.md b/tests/manual-tests/cases-standard-tests.md
index 4b885f6..5ec58be 100644
--- a/tests/manual-tests/cases-standard-tests.md
+++ b/tests/manual-tests/cases-standard-tests.md
@@ -362,4 +362,50 @@ python3 txt2stix.py \
--confidence 100 \
--use_extractions lookup_disarm_name \
--report_id 8cb2dbf0-136f-4ecb-995c-095496e22abc
+```
+
+### ai check content
+
+```shell
+python3 txt2stix.py \
+ --relationship_mode standard \
+ --input_file tests/data/extraction_types/all_cases.txt \
+ --name 'Test AI Content check' \
+ --tlp_level clear \
+ --confidence 100 \
+ --use_extractions 'pattern_*' \
+ --ai_content_check openai:gpt-4o \
+ --report_id 4fa18f2d-278b-4fd4-8470-62a8807d35ad
+```
+
+### attack flow demo
+
+no indicators
+
+```shell
+python3 txt2stix.py \
+ --relationship_mode standard \
+ --ai_settings_relationships openai:gpt-4o \
+ --input_file tests/data/manually_generated_reports/attack_flow_demo.txt \
+ --name 'Test MITRE ATT&CK Flow demo' \
+ --tlp_level clear \
+ --confidence 100 \
+ --use_extractions 'ai_mitre_attack_enterprise' \
+ --ai_create_attack_flow \
+ --report_id c0fef67c-720b-4184-a62e-ea465b4d89b5
+```
+
+with indicators
+
+```shell
+python3 txt2stix.py \
+ --relationship_mode standard \
+ --ai_settings_relationships openai:gpt-4o \
+ --input_file tests/data/manually_generated_reports/attack_flow_demo.txt \
+ --name 'Test MITRE ATT&CK Flow demo with iocs' \
+ --tlp_level clear \
+ --confidence 100 \
+ --use_extractions ai_mitre_attack_enterprise,'pattern_*' \
+ --ai_create_attack_flow \
+ --report_id 3b160a8d-12dd-4e7c-aee8-5af6e371b425
```
\ No newline at end of file
diff --git a/txt2stix/ai_extractor/base.py b/txt2stix/ai_extractor/base.py
index 7c16ef8..46cd91b 100644
--- a/txt2stix/ai_extractor/base.py
+++ b/txt2stix/ai_extractor/base.py
@@ -6,139 +6,21 @@
from llama_index.core import PromptTemplate
from llama_index.core.llms.llm import LLM
-from txt2stix.ai_extractor.utils import ExtractionList, ParserWithLogging, RelationshipList, get_extractors_str
+from txt2stix.ai_extractor.prompts import DEFAULT_CONTENT_CHECKER_TEMPL, DEFAULT_EXTRACTION_TEMPL, DEFAULT_RELATIONSHIP_TEMPL, DEFAULT_SYSTEM_PROMPT, ATTACK_FLOW_PROMPT_TEMPL
+from txt2stix.ai_extractor.utils import AttackFlowList, DescribesIncident, ExtractionList, ParserWithLogging, RelationshipList, get_extractors_str
from llama_index.core.utils import get_tokenizer
_ai_extractor_registry: dict[str, 'Type[BaseAIExtractor]'] = {}
class BaseAIExtractor():
- system_prompt = (textwrap.dedent(
- """
-
-
- You are a cyber-security threat intelligence analysis tool responsible for analysing intelligence provided in text files.
-
- You have a deep understanding of cybersecurity and threat intelligence concepts.
-
- IMPORTANT: You must always deliver your work as a computer-parsable output in JSON format. All output from you will be parsed with pydantic for further processing.
-
-
- """
- ))
- extraction_template = PromptTemplate(textwrap.dedent(
- """
-
-
- You are a cyber-security threat intelligence analysis tool responsible for analysing intelligence provided in text files.
-
- You have a deep understanding of cybersecurity and threat intelligence concepts.
-
- IMPORTANT: You must always deliver your work as a computer-parsable output in JSON format. All output from you will be parsed with pydantic for further processing.
-
-
-
-
-
- Using the report text printed between the `` tags, you should extract the Indicators of Compromise (IoCs) and Tactics, Techniques, and Procedures (TTPs) being described in it.
-
- The document can contain the same IOC or TTP one or more times. Only create one record for each extraction -- the extractions must be unique!
-
- Only one JSON object should exist for each unique value.
-
-
-
-
-
- Think about your answer first before you respond. The accuracy of your response is very important as this data will be used for operational purposes.
-
- If you don't know the answer, reply with success: false, do not ever try to make up an answer.
-
-
-
-
-
- {input_file}
-
-
-
-
-
- {extractors}
-
-
-
-
-
- IMPORTANT: Only include a valid JSON document in your response and no other text. The JSON document should be minified!.
-
- Response MUST be in JSON format.
-
- Response MUST start with: {"success":
-
- """
- ))
-
- relationship_template = PromptTemplate(textwrap.dedent(
- """
-
-
- You are a cyber-security threat intelligence analysis tool responsible for analysing intelligence provided in text files.
-
- You have a deep understanding of cybersecurity and threat intelligence concepts.
-
- IMPORTANT: You must always deliver your work as a computer-parsable output in JSON format. All output from you will be parsed with pydantic for further processing.
-
-
-
-
-
- The tag `` contains all the observables and TTPs that were extracted from the document provided in ``
-
- Please capture the relationships between the extractions and describe them using NLP techniques.
-
- A relationship MUST have different source_ref and target_ref
-
- Select an appropriate relationship_type from ``.
-
- Only use `related-to` or any other vague `relationship_type` as a last resort.
-
- The value of relationship_type MUST be clear, and it SHOULD NOT describe everything as related-to each other unless they are related in context of the `
-
- IMPORTANT: Only include a valid JSON document in your response and no other text. The JSON document should be minified!.
-
-
-
-
-
- Think about your answer first before you respond. The accuracy of your response is very important as this data will be used for operational purposes.
-
- If you don't know the answer, reply with success: false, do not ever try to make up an answer.
-
-
-
-
- {input_file}
-
-
-
- {extractions}
-
-
-
- {relationship_types}
-
+ system_prompt = DEFAULT_SYSTEM_PROMPT
+
+ extraction_template = DEFAULT_EXTRACTION_TEMPL
-
+ relationship_template = DEFAULT_RELATIONSHIP_TEMPL
- IMPORTANT: Only include a valid JSON document in your response and no other text. The JSON document should be minified!.
+ content_check_template = DEFAULT_CONTENT_CHECKER_TEMPL
- Response MUST be in JSON format.
-
- Response MUST start with: {"success":
-
- """
- ))
-
def _get_extraction_program(self):
return LLMTextCompletionProgram.from_defaults(
output_parser=ParserWithLogging(ExtractionList),
@@ -154,6 +36,28 @@ def _get_relationship_program(self):
verbose=True,
llm=self.llm,
)
+
+ def _get_content_checker_program(self):
+ return LLMTextCompletionProgram.from_defaults(
+ output_parser=ParserWithLogging(DescribesIncident),
+ prompt=self.content_check_template,
+ verbose=True,
+ llm=self.llm,
+ )
+
+ def check_content(self, text) -> DescribesIncident:
+ return self._get_content_checker_program()(context_str=text)
+
+ def _get_attack_flow_program(self):
+ return LLMTextCompletionProgram.from_defaults(
+ output_parser=ParserWithLogging(AttackFlowList),
+ prompt=ATTACK_FLOW_PROMPT_TEMPL,
+ verbose=True,
+ llm=self.llm,
+ )
+
+ def extract_attack_flow(self, input_text, extractions, relationships):
+ return self._get_attack_flow_program()(document=input_text, extractions=extractions, relationships=relationships)
def extract_relationships(self, input_text, extractions, relationship_types: list[str]) -> RelationshipList:
return self._get_relationship_program()(relationship_types=relationship_types, input_file=input_text, extractions=extractions)
diff --git a/txt2stix/ai_extractor/prompts.py b/txt2stix/ai_extractor/prompts.py
new file mode 100644
index 0000000..1323052
--- /dev/null
+++ b/txt2stix/ai_extractor/prompts.py
@@ -0,0 +1,164 @@
+
+from llama_index.core import PromptTemplate, ChatPromptTemplate
+import textwrap
+from llama_index.core.base.llms.types import ChatMessage, MessageRole
+
+
+DEFAULT_SYSTEM_PROMPT = textwrap.dedent(
+"""
+
+
+ You are a cyber-security threat intelligence analysis tool responsible for analysing intelligence provided in text files.
+
+ You have a deep understanding of cybersecurity and threat intelligence concepts.
+
+ IMPORTANT: You must always deliver your work as a computer-parsable output in JSON format. All output from you will be parsed with pydantic for further processing.
+
+
+"""
+)
+
+DEFAULT_EXTRACTION_TEMPL = PromptTemplate(textwrap.dedent(
+ """
+
+ You are a cyber-security threat intelligence analysis tool responsible for analysing intelligence provided in text files.
+ You have a deep understanding of cybersecurity and threat intelligence concepts.
+ IMPORTANT: You must always deliver your work as a computer-parsable output in JSON format. All output from you will be parsed with pydantic for further processing.
+
+
+ Using the report text printed between the `` tags, you should extract the Indicators of Compromise (IoCs) and Tactics, Techniques, and Procedures (TTPs) being described in it.
+ The document can contain the same IOC or TTP one or more times. Only create one record for each extraction -- the extractions must be unique!
+ Only one JSON object should exist for each unique value.
+
+
+ Think about your answer first before you respond. The accuracy of your response is very important as this data will be used for operational purposes.
+ If you don't know the answer, reply with success: false, do not ever try to make up an answer.
+
+
+ {input_file}
+
+
+ {extractors}
+
+
+ IMPORTANT: Only include a valid JSON document in your response and no other text. The JSON document should be minified!.
+ Response MUST be in JSON format.
+ Response MUST start with: {"success":
+
+ """
+))
+
+
+DEFAULT_RELATIONSHIP_TEMPL = PromptTemplate(textwrap.dedent(
+"""
+
+ You are a cyber-security threat intelligence analysis tool responsible for analysing intelligence provided in text files.
+ You have a deep understanding of cybersecurity and threat intelligence concepts.
+ IMPORTANT: You must always deliver your work as a computer-parsable output in JSON format. All output from you will be parsed with pydantic for further processing.
+
+
+ The tag `` contains all the observables and TTPs that were extracted from the document provided in ``
+ Please capture the relationships between the extractions and describe them using NLP techniques.
+ A relationship MUST have different source_ref and target_ref
+ Select an appropriate relationship_type from ``.
+ Only use `related-to` or any other vague `relationship_type` as a last resort.
+ The value of relationship_type MUST be clear, and it SHOULD NOT describe everything as related-to each other unless they are related in context of the `
+ IMPORTANT: Only include a valid JSON document in your response and no other text. The JSON document should be minified!.
+
+
+ Think about your answer first before you respond. The accuracy of your response is very important as this data will be used for operational purposes.
+ If you don't know the answer, reply with success: false, do not ever try to make up an answer.
+
+
+{input_file}
+
+
+{extractions}
+
+
+{relationship_types}
+
+
+ IMPORTANT: Only include a valid JSON document in your response and no other text. The JSON document should be minified!.
+ Response MUST be in JSON format.
+ Response MUST start with: {"success":
+
+"""
+))
+
+DEFAULT_CONTENT_CHECKER_TEMPL = PromptTemplate("""
+
+ You are a cyber security threat intelligence analyst.
+    Your job is to review reports that describe cyber security incidents.
+ Examples include malware analysis, APT group reports, data breaches, vulnerabilities, or Indicators of Compromise.
+    Some of the documents you are given will not describe a cyber security incident.
+    I need you to tell me whether the text provided describes one, and how it should be classified.
+
+
+ Using the MARKDOWN of the report provided in
+ IMPORTANT: the output should be structured as valid JSON.
+ IMPORTANT: output should not be in markdown, it must be a plain JSON text without any code block
+ IMPORTANT: do not include any comment in the output
+ IMPORTANT: output must start with a `{` and end with a `}` and must not contain "```"
+
+
+{context_str}
+
+
+ Possible Incident Classifications are
+ * Other: the report does not fit into any of the following categories
+ * APT Group
+ * Vulnerability
+ * Data Leak
+ * Malware
+ * Ransomware
+ * Infostealer
+ * Threat Actor
+ * Campaign
+ * Exploit
+ * Cyber Crime
+ * Indicators of Compromise
+ * TTPs
+
+""")
+
+ATTACK_FLOW_PROMPT_TEMPL = ChatPromptTemplate([
+ ChatMessage.from_str("""You are a cyber security threat intelligence analyst.
+Your job is to review reports that describe cyber security incidents.
+Examples include malware analysis, APT group reports, data breaches and vulnerabilities.""", MessageRole.SYSTEM),
+ ChatMessage.from_str("Hi, What would you like me to process for you? the message below must contain the document and the document only", MessageRole.ASSISTANT),
+ ChatMessage.from_str("{document}", MessageRole.USER),
+ ChatMessage.from_str("What are the objects that have been extracted () from the document above?", MessageRole.ASSISTANT),
+ ChatMessage.from_str("{extractions}", MessageRole.USER),
+    ChatMessage.from_str("What are the relationships that have been extracted () between the documents?", MessageRole.ASSISTANT),
+ ChatMessage.from_str("{relationships}", MessageRole.USER),
+ ChatMessage.from_str("What should I do with all the data that have been provided?", MessageRole.ASSISTANT),
+ ChatMessage.from_str("""Consider all the MITRE ATT&CK Objects extracted from the report and the relationships they have to other objects.
+
+Now I need you to logically define the order of ATT&CK Tactics/Techniques as they are executed in the incident described in the report.
+
+It is possible that the Techniques extracted are not linked to the relevant MITRE ATT&CK Tactic. You should also assign the correct Tactic to a Technique where a Technique belongs to many ATT&CK Tactics in the ATT&CK Matrix if that can correctly be inferred.
+
+You should also provide a short overview about how this technique is described in the report as the name, and a longer version in description.
+
+IMPORTANT: only include the ATT&CK IDs extracted already, do not add any new extractions.
+
+You should deliver a response in JSON as follows
+
+[
+{
+ "position": "",
+ "attack_technique_id": "",
+ "name": "",
+ "description": ""
+},
+{
+ "position": "",
+ "attack_technique_id": "",
+ "name": "",
+ "description": ""
+}
+]""", MessageRole.USER)
+])
\ No newline at end of file
diff --git a/txt2stix/ai_extractor/utils.py b/txt2stix/ai_extractor/utils.py
index d8d78e4..5721d14 100644
--- a/txt2stix/ai_extractor/utils.py
+++ b/txt2stix/ai_extractor/utils.py
@@ -7,7 +7,7 @@
from ..extractions import Extractor
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, RootModel
from llama_index.core.output_parsers import PydanticOutputParser
class Extraction(BaseModel):
@@ -30,7 +30,22 @@ class RelationshipList(BaseModel):
relationships: list[Relationship] = Field(default_factory=list)
success: bool
+class DescribesIncident(BaseModel):
+    describes_incident: bool = Field(description="does the document include malware analysis, APT group reports, data breaches and vulnerabilities?")
+ explanation: str = Field(description="Two or three sentence summary of the incidents it describes OR summary of what it describes instead of an incident")
+ incident_classification : str = Field(description="One of valid incident classifications that best describes this document/report")
+class AttackFlowItem(BaseModel):
+ position : int = Field(description="order of object starting at 0")
+ attack_tactic_id : str
+ attack_technique_id : str
+ name: str
+ description: str
+
+class AttackFlowList(BaseModel):
+ matrix : str = Field(description="one of ics, mobile and enterprise")
+ items : list[AttackFlowItem]
+ success: bool = Field(description="determines if there's any valid flow in ")
class ParserWithLogging(PydanticOutputParser):
def parse(self, text: str):
diff --git a/txt2stix/attack_flow.py b/txt2stix/attack_flow.py
new file mode 100644
index 0000000..e797d47
--- /dev/null
+++ b/txt2stix/attack_flow.py
@@ -0,0 +1,78 @@
+import uuid
+from stix2 import Relationship
+
+from txt2stix.retriever import STIXObjectRetriever
+from stix2extensions.attack_action import AttackAction, AttackFlow
+from stix2extensions._extensions import attack_flow_ExtensionDefinitionSMO
+
+def parse_flow(report, flow):
+ attack_objects = STIXObjectRetriever().get_attack_objects(
+ flow.matrix,
+ [item.attack_tactic_id for item in flow.items]
+ + [item.attack_technique_id for item in flow.items],
+ )
+ attack_objects = {
+ obj["external_references"][0]["external_id"]: obj for obj in attack_objects
+ }
+ flow_objects = [report, attack_flow_ExtensionDefinitionSMO]
+ last_action = None
+ for i, item in enumerate(flow.items):
+ try:
+ tactic_obj = attack_objects[item.attack_tactic_id]
+            technique_obj = attack_objects[item.attack_technique_id]
+            action_obj = AttackAction(
+                **{
+                    "id": f"attack-action--{str(uuid.uuid4())}",
+                    "effect_refs": [f"attack-action--{str(uuid.uuid4())}"],
+                    "technique_id": item.attack_technique_id,
+                    "technique_ref": technique_obj["id"],
+                    "tactic_id": item.attack_tactic_id,
+                    "tactic_ref": tactic_obj["id"],
+ "name": item.name,
+ "description": item.description,
+ },
+ allow_custom=True,
+ )
+ action_obj.effect_refs.clear()
+ flow_objects.append(tactic_obj)
+ flow_objects.append(technique_obj)
+ if i == 0:
+ flow_obj = {
+ "type": "attack-flow",
+ "id": report.id.replace("report", "attack-flow"),
+ "spec_version": "2.1",
+ "created": report.created,
+ "modified": report.modified,
+ "created_by_ref": report.created_by_ref,
+ "start_refs": [action_obj["id"]],
+ "name": report.name,
+ "description": report.description,
+ "scope": "malware",
+ "external_references": report.external_references,
+ "object_marking_refs": report.object_marking_refs,
+ }
+ flow_objects.append(AttackFlow(**flow_obj))
+ flow_objects.append(
+ Relationship(
+ type="relationship",
+ spec_version="2.1",
+ # id="relationship--",
+ created_by_ref=report.created_by_ref,
+ created=report.created,
+ modified=report.modified,
+ relationship_type="attack-flow",
+ description=f"Attack Flow for {report.name}",
+ source_ref=report.id,
+ target_ref=flow_obj['id'],
+ external_references=report.external_references,
+ object_marking_refs=report.object_marking_refs,
+ )
+ )
+ else:
+ last_action["effect_refs"].append(action_obj["id"])
+ flow_objects.append(action_obj)
+ last_action = action_obj
+        except Exception:
+ pass
+
+ return flow_objects
diff --git a/txt2stix/retriever.py b/txt2stix/retriever.py
index 39fa09f..dc68770 100644
--- a/txt2stix/retriever.py
+++ b/txt2stix/retriever.py
@@ -17,10 +17,14 @@ def __init__(self, host="ctibutler") -> None:
else:
raise NotImplementedError("The type `%s` is not supported", host)
- def get_attack_objects(self, matrix, attack_id):
+ def get_attack_object(self, matrix, attack_id):
endpoint = urljoin(self.api_root, f"v1/attack-{matrix}/objects/{attack_id}/")
return self._retrieve_objects(endpoint)
+ def get_attack_objects(self, matrix, attack_ids):
+ endpoint = urljoin(self.api_root, f"v1/attack-{matrix}/objects/?attack_id={','.join(attack_ids)}")
+ return self._retrieve_objects(endpoint)
+
def get_objects_by_id(self, id, type):
return self._retrieve_objects(urljoin(self.api_root, f"v1/{type}/objects/{id}/"))
@@ -63,11 +67,11 @@ def retrieve_stix_objects(stix_mapping: str, id, host=None):
match stix_mapping:
### ATT&CK by ID
case 'mitre-attack-ics-id':
- return retreiver.get_attack_objects('ics', id)
+ return retreiver.get_attack_object('ics', id)
case 'mitre-attack-mobile-id':
- return retreiver.get_attack_objects('mobile', id)
+ return retreiver.get_attack_object('mobile', id)
case 'mitre-attack-enterprise-id':
- return retreiver.get_attack_objects('enterprise', id)
+ return retreiver.get_attack_object('enterprise', id)
### Others by ID
case "mitre-capec-id":
diff --git a/txt2stix/stix.py b/txt2stix/stix.py
index e80f52c..8f3df00 100644
--- a/txt2stix/stix.py
+++ b/txt2stix/stix.py
@@ -152,6 +152,7 @@ class txt2stixBundler:
uuid = None
id_map = dict()
id_value_map = dict()
+ _flow_objects = []
# this identity is https://raw.githubusercontent.com/muchdogesec/stix4doge/main/objects/identity/txt2stix.json
default_identity = Identity(
type="identity",
@@ -415,3 +416,15 @@ def indicator_id_from_value(value, stix_mapping):
return "indicator--" + str(
uuid.uuid5(UUID_NAMESPACE, f"txt2stix+{stix_mapping}+{value}")
)
+
+ @property
+ def flow_objects(self):
+ return self._flow_objects
+
+ @flow_objects.setter
+ def flow_objects(self, objects):
+ for obj in objects:
+ if obj['id'] == self.report.id:
+ continue
+ self.add_ref(obj)
+ self._flow_objects = objects
\ No newline at end of file
diff --git a/txt2stix/txt2stix.py b/txt2stix/txt2stix.py
index eee73cd..b40e8d3 100644
--- a/txt2stix/txt2stix.py
+++ b/txt2stix/txt2stix.py
@@ -10,6 +10,9 @@
from pydantic import BaseModel
+from txt2stix.attack_flow import parse_flow
+
+
from .utils import remove_links
from .common import UUID_NAMESPACE, FatalException
@@ -20,6 +23,8 @@
import functools
from fnmatch import filter
from .ai_extractor import ALL_AI_EXTRACTORS, BaseAIExtractor, ModelError
+from stix2.serialization import serialize as stix2_serialize
+from stix2 import Bundle
import json, logging
@@ -130,6 +135,12 @@ def parse_args():
parser = argparse.ArgumentParser(description="File Conversion Tool")
inf_arg = parser.add_argument("--input_file", "--input-file", required=True, help="The file to be converted. Must be .txt", type=Path)
+ parser.add_argument("--ai_check_content", required=False, type=parse_model, help="Use an AI model to check whether the content of the file contains threat intelligence. Particularly useful to weed out vendor marketing.")
+ if (args := parser.parse_known_args()[0]) and args.ai_check_content:
+ model : BaseAIExtractor = args.ai_check_content
+ value = model.check_content(args.input_file.read_text())
+ print("check-content output:", value.model_dump_json())
+ exit(0)
name_arg = parser.add_argument("--name", required=True, help="Name of the file, max 124 chars", default="stix-out")
parser.add_argument("--created", required=False, default=datetime.now(), help="Allow user to optionally pass --created time in input, which will hardcode the time used in created times")
parser.add_argument("--ai_settings_extractions", required=False, type=parse_model, help="(required if AI extraction enabled): passed in format provider:model e.g. openai:gpt4o. Can pass more than one value to get extractions from multiple providers.", metavar="provider[:model]", nargs='+', default=[parse_model('openai')])
@@ -145,6 +156,7 @@ def parse_args():
parser.add_argument('--ignore_image_refs', default=True, type=parse_bool)
parser.add_argument('--ignore_link_refs', default=True, type=parse_bool)
parser.add_argument("--ignore_extraction_boundary", default=False, type=parse_bool, help="default if not passed is `false`, but if set to `true` will ignore boundary capture logic for extractions")
+ parser.add_argument('--ai_create_attack_flow', default=False, action='store_true', help="create attack flow for attack objects in report/bundle")
args = parser.parse_args()
if not args.input_file.exists():
@@ -155,9 +167,11 @@ def parse_args():
if args.relationship_mode == 'ai' and not args.ai_settings_relationships:
parser.error("relationship_mode is set to AI, --ai_settings_relationships is required")
+ if args.ai_create_attack_flow and not args.ai_settings_relationships:
+ parser.error("--ai_create_attack_flow requires --ai_settings_relationships")
#### process --use-extractions
if args.use_extractions.get('ai') and not args.ai_settings_extractions:
- parser.error("ai based extractors are passed, --ai_settings_relationships is required")
+ parser.error("ai based extractors are passed, --ai_settings_extractions is required")
args.all_extractors = all_extractors
return args
@@ -246,11 +260,13 @@ def main():
job_id = args.report_id or str(uuid.uuid4())
setLogFile(logger, Path(f"logs/logs-{job_id}.log"))
logger.info(f"Arguments: {json.dumps(sys.argv[1:])}")
+
input_text = args.input_file.read_text()
preprocessed_text = remove_links(input_text, args.ignore_image_refs, args.ignore_link_refs)
load_env()
+
bundler = txt2stixBundler(args.name, args.use_identity, args.tlp_level, input_text, args.confidence, args.all_extractors, args.labels, created=args.created, report_id=args.report_id, external_references=args.external_refs)
log_notes(sys.argv, "Config")
convo_str = None
@@ -267,6 +283,13 @@ def main():
extracted_relationships = extract_relationships_with_ai(bundler, preprocessed_text, all_extracts, args.ai_settings_relationships)
# convo_str = ai_extractor_session.get_conversation() if ai_extractor_session and ai_extractor_session.initialized else ""
+ flow = None
+ if args.ai_create_attack_flow:
+ logging.info("creating attack-flow bundle")
+ ex: BaseAIExtractor = args.ai_settings_relationships
+ flow = ex.extract_attack_flow(input_text, all_extracts, extracted_relationships)
+ bundler.flow_objects = parse_flow(bundler.report, flow)
+
out = bundler.to_json()
@@ -276,11 +299,17 @@ def main():
logger.info(f"Wrote bundle output to `{output_path}`")
data = {
"extractions": all_extracts,
- "relationships": extracted_relationships
+ "relationships": extracted_relationships,
+ "attack-flow": flow.model_dump() if flow else None,
}
data_path = Path(str(output_path).replace('bundle--', 'data--'))
data_path.write_text(json.dumps(data, indent=4))
logger.info(f"Wrote data output to `{data_path}`")
+ if flow:
+ flow_path = Path(str(output_path).replace('bundle--', 'attack-flow-bundle--'))
+ flow_bundle = Bundle(objects=bundler.flow_objects, allow_custom=True)
+ flow_path.write_text(stix2_serialize(flow_bundle, indent=4))
+ logger.info(f"Wrote attack-flow bundle to `{flow_path}`")
except argparse.ArgumentError as e:
logger.exception(e, exc_info=True)
except:
diff --git a/txt2stix/utils.py b/txt2stix/utils.py
index d0abbfe..c089971 100644
--- a/txt2stix/utils.py
+++ b/txt2stix/utils.py
@@ -71,6 +71,7 @@ def validate_file_mimetype(file_name):
_, ext = os.path.splitext(file_name)
return FILE_EXTENSIONS.get(ext)
+
TLDs = [tld.lower() for tld in read_included_file('helpers/tlds.txt').splitlines()]
REGISTRY_PREFIXES = [key.lower() for key in read_included_file('helpers/windows_registry_key_prefix.txt').splitlines()]
FILE_EXTENSIONS = dict(line.lower().split(',') for line in read_included_file('helpers/mimetype_filename_extension_list.csv').splitlines())
\ No newline at end of file