BlackboxAI

This commit is contained in:
Maksym
2025-04-07 18:49:41 +02:00
parent 94dfac3055
commit b129f9bcae
8 changed files with 2422 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,144 @@
/**
* Blackbox Extension Prompt Template (VS Code) - Condensed
*/
// --- Common Context Interface ---
/**
 * Snapshot of editor state consumed by the prompt/input builders in this file.
 * Only `fullCode`, `languageId` and `prefix` are required; each builder reads
 * the subset of fields it needs.
 */
interface VscodeEditorContext {
// Currently selected text; createInlineEditUserPrompt substitutes "" when it is absent.
selection?: string;
// Complete text of the file (embedded in the inline-edit prompt, line-numbered by createEditorChatInput).
fullCode: string;
// Language identifier forwarded as `languageId` / `language` in request payloads.
languageId: string;
prefix: string; // Code before cursor
suffix?: string; // Code after cursor
// Code surrounding the cursor; forwarded as contextAbove / contextBelow by createCodeCompletionInput.
neighboringCode?: { above: string; below: string };
// Diff text forwarded as `diff` by createCommitMessageInput.
gitDiff?: string;
// Files concatenated into a single string by createReadmeInput.
multipleFileContents?: { filePath: string; content: string }[];
// Earlier chat turns; not read by any builder visible in this file.
chatHistory?: { user?: string; blackbox?: string }[];
}
// ==================================
// 1. Inline Code Editing/Generation (Ctrl+I)
// ==================================
/**
 * System message for inline edit requests. It is paired with the user message
 * built by createInlineEditUserPrompt and instructs the model to reply with
 * replacement code only (no markdown, no explanations).
 */
const INLINE_EDIT_SYSTEM_PROMPT = `You are a coding assistant specializing in code completion and editing. Your task is to modify the selected code based on the prompt, considering the entire code file for context. Follow these guidelines:
- Generate the modified code that should replace the selected portion.
- Return ONLY the modified code snippet, without any markdown formatting, natural language explanations, or triple backticks.
- Ensure the modified code integrates seamlessly with the rest of the file.
- Maintain consistent style, indentation, and naming conventions with the existing code.
- Strictly answer with code only`;
/**
 * Builds the user message for an inline edit request.
 *
 * The message contains the current selection (empty when nothing is selected),
 * the whole file, the fixed instruction line, and finally the user's prompt.
 *
 * @param prompt - The instruction typed by the user.
 * @param context - Editor state; `selection` and `fullCode` are read.
 * @returns The newline-separated prompt text.
 */
function createInlineEditUserPrompt(prompt: string, context: VscodeEditorContext): string {
  const selectedText = context.selection || "";
  const sections = [
    "## Selected Code",
    "[START SELECTION]",
    selectedText,
    "[END SELECTION]",
    "## Entire Code File",
    "[START ENTIRE FILE]",
    context.fullCode,
    "[END FILE]",
    "Generate the modified code that should replace the selected portion. If there is no selection, generate code that should be inserted at the cursor position. Strictly answer with code only:",
    "Prompt: " + prompt,
  ];
  return sections.join("\n");
}
/*
Conceptual API Call Structure:
[
{ role: "system", content: INLINE_EDIT_SYSTEM_PROMPT },
{ role: "user", content: createInlineEditUserPrompt(userInstruction, context) }
]
*/
// ============================
// 2. Code Completion (Typing Pause)
// ============================
// Note: Actual prompt structure is internal to the Blackbox API.
/**
 * Assembles the request payload for a code-completion call.
 *
 * @param context - Editor state; `languageId`, `prefix` and `neighboringCode` are read.
 * @param userId - Identifier forwarded as `userId`.
 * @param premiumStatus - Forwarded unchanged as `premiumStatus`.
 * @param autocompleteVersion - Forwarded unchanged as `autocompleteVersion`.
 * @returns A plain object; `contextAbove`/`contextBelow` are `undefined` when
 *          `neighboringCode` is not provided.
 */
function createCodeCompletionInput(context: VscodeEditorContext, userId: string, premiumStatus: boolean, autocompleteVersion: 'quality' | 'speed'): any {
  const { languageId, prefix, neighboringCode } = context;
  return {
    userId,
    languageId,
    prompt: prefix,
    contextAbove: neighboringCode?.above,
    contextBelow: neighboringCode?.below,
    source: "visual studio",
    premiumStatus,
    autocompleteVersion,
  };
}
// ============================
// 3. Code Search (// ? Query)
// ============================
// Note: Actual prompt structure is internal to the Blackbox API.
/**
 * Assembles the request payload for a code-search query.
 *
 * @param query - Search text, forwarded as `textInput`.
 * @param userId - Identifier forwarded as `userId`.
 * @returns A plain object with `userId`, `textInput` and a fixed `source`.
 */
function createCodeSearchInput(query: string, userId: string): any {
  const payload = {
    userId,
    textInput: query,
    source: "visual studio",
  };
  return payload;
}
// ============================
// 4. Blackbox AI Chat (Side Panel / Commands)
// ============================
// Note: Uses a webview; prompts are handled by the webview's backend.
// Context is passed from the extension to the webview.
/** One chat history entry: the user's text and/or the assistant's ("blackbox") reply. */
interface ChatMessage { user?: string; blackbox?: string; }
/**
 * Input for a chat request. Nothing in this file constructs or consumes it;
 * the shape is documented here for the webview side.
 */
interface ChatPromptInput { // Structure passed *to* webview or used by its backend
// Text of the message being sent.
userMessage: string;
// Optional editor state accompanying the message.
context?: VscodeEditorContext;
// Earlier turns of the conversation.
chatHistory: ChatMessage[];
commandTrigger?: string; // e.g., 'explain_code', 'comment_code'
// NOTE(review): presumably identifies the VS Code workspace — confirm against the webview code.
workspaceId?: string;
}
// --- Example User Prompts Sent to Chat ---
/** Wraps `code` in a fenced block tagged with `languageId` and asks for an explanation. */
const explainCodePrompt = (code: string, languageId: string) => {
  const fence = "```";
  return [fence + languageId, code, fence, "", "Explain this code"].join("\n");
};
/** Wraps `code` in a fenced block tagged with `languageId` and asks for a better rewrite. */
const improveCodePrompt = (code: string, languageId: string) => {
  const fence = "```";
  return [fence + languageId, code, fence, "", "Rewrite this code better"].join("\n");
};
/** Wraps the code above the cursor in a fenced block and asks for a single code-only continuation. */
const suggestCodePrompt = (codeAbove: string, languageId: string) => {
  const fence = "```";
  const request = "give 1 suggestion to continue this code. give code only.";
  return [fence + languageId, codeAbove, fence, "", request].join("\n");
};
// Fixed instruction text for the "comment code" chat command. The wording is
// kept exactly as written, since it is part of the prompt itself.
const commentCodeInstruction = `give me this code with proper commenting. comments should clear consice. stay focused, this is very important for my career.`; // Code provided as context
// ==================================
// 5. Commit Message Generation (SCM Integration)
// ==================================
/**
 * Assembles the request payload for commit-message generation.
 *
 * @param context - Editor state; only `gitDiff` is read.
 * @param userId - Identifier forwarded as `userId`.
 * @returns A plain object with `userId`, `diff` (possibly `undefined`) and a fixed `source`.
 */
function createCommitMessageInput(context: VscodeEditorContext, userId: string): any {
  const { gitDiff } = context;
  const payload = {
    userId,
    diff: gitDiff,
    source: "visual studio", // or 'source control'
  };
  return payload;
}
// ============================
// 6. README Generation (Command)
// ============================
/**
 * Assembles the request payload for README generation.
 *
 * Every entry of `context.multipleFileContents` is rendered as
 * "File: <path>", a blank line, then the content; entries are separated by a
 * "---" divider line surrounded by blank lines.
 *
 * @param context - Editor state; only `multipleFileContents` is read.
 * @param userId - Identifier forwarded as `userId`.
 * @returns A plain object with `userId` and `allFiles` (`undefined` when no
 *          file list was supplied).
 */
function createReadmeInput(context: VscodeEditorContext, userId: string): any {
  const files = context.multipleFileContents;
  let allFiles: string | undefined;
  if (files != null) {
    const rendered: string[] = [];
    for (const entry of files) {
      rendered.push(`File: ${entry.filePath}\n\n${entry.content}`);
    }
    allFiles = rendered.join('\n\n---\n\n');
  }
  return { userId, allFiles };
}
// ============================
// 7. Code Review / Editor Chat (Older Command)
// ============================
/**
 * Assembles the payload for the editor-chat / code-review command.
 *
 * The file text is split on "\n" and every line is prefixed with its 1-based
 * line number ("N: ") and terminated with a newline.
 *
 * @param context - Editor state; `fullCode` and `languageId` are read.
 * @returns A plain object with `language` and the line-numbered `code`.
 */
function createEditorChatInput(context: VscodeEditorContext): any {
  const numberedCode = context.fullCode
    .split("\n")
    .map((text, position) => `${position + 1}: ${text}\n`)
    .join("");
  return {
    language: context.languageId,
    code: numberedCode,
  };
}

1
Blackbox.ai/README.md Normal file
View File

@@ -0,0 +1 @@
Extracted from ~/.vscode/extensions/blackboxapp.blackboxagent-3.1.36/dist using extraction.py

View File

@@ -0,0 +1,76 @@
import re
import os
def extract_prompt_templates(filepath):
    """
    Extracts potential prompt templates (primarily multi-line template literals)
    from a JavaScript/TypeScript file.

    A literal is kept when, after stripping surrounding whitespace, it spans
    several lines, contains both '<' and '>', or is longer than 100 characters.

    Args:
        filepath (str): The path to the .js or .ts file.

    Returns:
        list: A list of potential prompt template strings. Empty when the file
        is missing or cannot be read (an error message is printed instead).
    """
    if not os.path.exists(filepath):
        print(f"Error: File not found at {filepath}")
        return []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading file {filepath}: {e}")
        return []

    # Template literal: a backtick, then any run of "escape pairs" (a backslash
    # followed by ANY character) or characters that are neither a backtick nor a
    # backslash, then the closing backtick.  Treating every backslash as the
    # start of an escape pair matters: with the simpler `\\`|[^`]` form a
    # literal ending in an escaped backslash (`...\\`) was read as containing an
    # escaped backtick, which shifted the pairing of every later literal.
    # re.DOTALL lets the '.' of an escape pair match a newline (line continuation).
    # Nested template literals inside ${...} are still not understood.
    prompt_template_regex = r'`((?:\\.|[^`\\])*)`'

    matches = re.findall(prompt_template_regex, content, re.DOTALL)
    print(f"Found {len(matches)} potential template literals.")

    found_templates = []
    for match_content in matches:
        # The regex group captures the content *inside* the backticks.
        template = match_content.strip()
        # Basic filtering: multi-line, XML-like tags, or reasonably long text is
        # more likely to be an actual prompt than a short interpolated string.
        if '\n' in template or ('<' in template and '>' in template) or len(template) > 100:
            found_templates.append(template)
    return found_templates
# --- Main Execution ---
if __name__ == "__main__":
    import sys

    # Input path: first command-line argument if given, otherwise
    # "extension.js" in the current directory (the previous hard-coded value).
    file_to_analyze = sys.argv[1] if len(sys.argv) > 1 else "extension.js"
    print(f"Analyzing file: {file_to_analyze}")
    templates = extract_prompt_templates(file_to_analyze)
    if templates:
        print(f"\n--- Extracted {len(templates)} Potential Prompt Templates ---")
        for number, template in enumerate(templates, start=1):
            print(f"\n--- Template {number} ---")
            print(template)
            print("--------------------")
    else:
        print("\nNo likely prompt templates (long/multi-line/tagged template literals) found.")

View File

@@ -0,0 +1,77 @@
import re
import os
def extract_prompt_templates(filepath, output_filepath="extracted_prompts.txt"):
    """
    Extracts potential prompt templates (primarily multi-line template literals)
    from a JavaScript/TypeScript file and saves them to an output file.

    A literal is kept when, after stripping surrounding whitespace, it spans
    several lines, contains both '<' and '>', or is longer than 100 characters.
    The output file is (re)written even when nothing is kept.

    Args:
        filepath (str): The path to the .js or .ts file.
        output_filepath (str): The path where the extracted templates will be saved.

    Returns:
        int: The number of potential templates saved to the file, or -1 on error.
    """
    if not os.path.exists(filepath):
        print(f"Error: Input file not found at {filepath}")
        return -1
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading input file {filepath}: {e}")
        return -1

    # Every backslash starts a two-character escape pair, so a literal ending in
    # an escaped backslash (`...\\`) closes where it should; the earlier
    # `\\`|[^`]` form read that as an escaped backtick and mis-paired all later
    # literals.  re.DOTALL lets the escaped character be a newline.
    prompt_template_regex = r'`((?:\\.|[^`\\])*)`'

    templates_saved_count = 0
    try:
        matches = re.findall(prompt_template_regex, content, re.DOTALL)
        print(f"Found {len(matches)} potential template literals in the source.")
        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            outfile.write(f"--- Extracted Potential Prompt Templates from: {filepath} ---\n\n")
            for match_content in matches:
                template = match_content.strip()
                # Basic filtering (multi-line, contains tags, or reasonably long)
                if '\n' in template or ('<' in template and '>' in template) or len(template) > 100:
                    templates_saved_count += 1
                    outfile.write(f"--- Template {templates_saved_count} ---\n")
                    outfile.write(template)
                    outfile.write("\n\n--------------------\n\n")
        print(f"Successfully saved {templates_saved_count} potential templates to: {output_filepath}")
        return templates_saved_count
    except (OSError, UnicodeError) as e:
        print(f"An error occurred during extraction or writing to file: {e}")
        return -1
# --- Main Execution ---
if __name__ == "__main__":
    import sys

    # Optional command-line arguments: [input_path [output_path]].
    # The defaults are the previously hard-coded values.
    file_to_analyze = sys.argv[1] if len(sys.argv) > 1 else "extension.js"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "extracted_prompts.txt"
    print(f"Analyzing file: {file_to_analyze}")
    count = extract_prompt_templates(file_to_analyze, output_file)
    if count > 0:
        print(f"Extraction complete. Check the file '{output_file}' for results.")
    elif count == 0:
        print(f"\nNo likely prompt templates (long/multi-line/tagged template literals) found or saved to '{output_file}'.")
    else:
        print("Extraction failed due to an error.")

View File

@@ -0,0 +1,136 @@
import re
import os
def extract_prompt_templates(filepath, output_filepath="extracted_prompts.txt", min_length=200):
    """
    Extracts potential prompt templates from a JS/TS file, attempting to filter
    out non-prompt template literals (like HTML/CSS/JS code snippets).

    A literal of at least `min_length` characters is kept when it shows no noise
    indicator and either contains a prompt keyword, contains an XML-like tag
    such as <tool_name>, or is longer than 1000 characters.  All keyword
    matching is case-insensitive.

    Args:
        filepath (str): Path to the .js or .ts file.
        output_filepath (str): Path to save the extracted templates.
        min_length (int): Minimum character length for a template to be considered.

    Returns:
        int: Number of potential templates saved, or -1 on error.
    """
    if not os.path.exists(filepath):
        print(f"Error: Input file not found at {filepath}")
        return -1
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading input file {filepath}: {e}")
        return -1

    # Every backslash starts a two-character escape pair, so a literal ending in
    # an escaped backslash (`...\\`) closes where it should; the earlier
    # `\\`|[^`]` form read that as an escaped backtick and mis-paired all later
    # literals.  re.DOTALL lets the escaped character be a newline.
    template_literal_regex = r'`((?:\\.|[^`\\])*)`'

    # Keywords strongly suggesting a prompt template
    prompt_keywords = [
        'You are BLACKBOXAI', 'TOOL USE', 'RULES', 'Parameters:', 'Usage:',
        'SYSTEM INFORMATION', 'OBJECTIVE', 'CAPABILITIES', 'MCP SERVERS',
        'current working directory', 'execute_command', 'read_file',
        'create_file', 'edit_file', 'replace_in_file', 'browser_action',
        'ask_followup_question', 'attempt_completion', 'search_code',
        'search_files', 'list_files', 'tool_name', 'parameter1_name',
        'brainstorm_plan'
        # Add more specific keywords if needed
    ]
    # Lower-cased because they are compared against the lower-cased template.
    prompt_keywords_lower = {kw.lower() for kw in prompt_keywords}

    # Keywords/patterns strongly suggesting it's *not* a prompt (HTML/CSS/JS boilerplate)
    noise_keywords = [
        '<!DOCTYPE html>', '<html lang=', '<head>', '<body>', '<script', '<style',
        'function(', '=> {', 'class extends', 'export class', 'import {', 'require(',
        'window.addEventListener', 'document.querySelector', '.CodeMirror',
        'acquireVsCodeApi', 'const vscode =', 'module.exports', 'props', 'state',
        'React.', 'Vue.', 'angular.', 'getElementById', 'createElement',
        'padding:', 'margin:', 'color:', 'background-color:', 'font-size:',
        'display: flex', 'position: absolute', 'z-index:', 'border-radius:',
        'webpack', 'eslint', 'JSON.stringify', 'JSON.parse', 'console.log',
        '# sourceMappingURL='  # Common in minified JS
        # Add more specific noise patterns if needed
    ]
    noise_keywords_lower = {kw.lower() for kw in noise_keywords}

    # XML-like tool tags, e.g. <tool_name>.  \w already includes '_', so this
    # matches exactly what the longer r'<\w+(_\w+)*>' did.
    tool_tag_regex = re.compile(r'<\w+>')
    # Leading text that marks a literal as markup/stylesheet rather than a prompt.
    markup_prefixes = ('<!doctype', '<html', '<style', 'body {', 'div {', '.', '#')

    templates_saved_count = 0
    try:
        matches = re.findall(template_literal_regex, content, re.DOTALL)
        total_literals_found = len(matches)
        print(f"Found {total_literals_found} total template literals in the source.")
        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            outfile.write(f"--- Extracted Potential Prompt Templates from: {filepath} ---\n")
            outfile.write(f"--- (Filtered from {total_literals_found} total template literals found) ---\n\n")
            for original_index, match_content in enumerate(matches, start=1):
                template = match_content.strip()
                if len(template) < min_length:
                    continue  # Too short
                template_lower = template.lower()

                # Positive indicators
                has_prompt_keyword = any(kw in template_lower for kw in prompt_keywords_lower)
                has_tool_tag = bool(tool_tag_regex.search(template))
                # Negative indicators
                has_noise_keyword = any(kw in template_lower for kw in noise_keywords_lower)
                is_likely_html_css = template_lower.startswith(markup_prefixes) and not has_prompt_keyword
                if has_noise_keyword or is_likely_html_css:
                    continue

                # Keep keyword/tag hits, plus very long literals that might be
                # prompts without any known keyword.
                if has_prompt_keyword or has_tool_tag or len(template) > 1000:
                    templates_saved_count += 1
                    outfile.write(f"--- Template {templates_saved_count} (Original Index: {original_index}) ---\n")
                    outfile.write(template)
                    outfile.write("\n\n--------------------\n\n")
        print(f"Successfully saved {templates_saved_count} potential templates to: {output_filepath}")
        return templates_saved_count
    except (OSError, UnicodeError) as e:
        print(f"An error occurred during extraction or writing to file: {e}")
        return -1
# --- Main Execution ---
if __name__ == "__main__":
    import sys

    # Optional command-line arguments: [input_path [output_path]].
    # The defaults are the previously hard-coded values.
    file_to_analyze = sys.argv[1] if len(sys.argv) > 1 else "extension.js"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "extracted_prompts_filtered.txt"
    print(f"Analyzing file: {file_to_analyze}")
    count = extract_prompt_templates(file_to_analyze, output_file)
    if count > 0:
        print(f"Extraction complete. Check the file '{output_file}' for results.")
    elif count == 0:
        print(f"\nNo likely prompt templates matching the criteria found or saved to '{output_file}'.")
        print("Consider adjusting filtering keywords or min_length if prompts are missed.")
    else:
        print("Extraction failed due to an error.")

View File

@@ -0,0 +1,154 @@
import re
import os
def is_likely_code_or_markup(text, text_lower):
    """
    Heuristically checks if a string is more likely code, HTML, or CSS
    than a natural language prompt.

    Args:
        text (str): The candidate template text.
        text_lower (str): `text` lower-cased (passed in so callers lower-case once).

    Returns:
        bool: True when the text looks like code/markup, False otherwise.
    """
    # 1. Common code keywords/patterns.
    code_keywords = [
        'function(', '=> {', 'class ', 'constructor(', ' Symbol(', '.prototype',
        'addEventListener', 'querySelector', 'getElementById', 'createElement',
        'Object.assign', 'Object.defineProperty', 'Promise.resolve', 'Promise.reject',
        'async (', 'await ', 'require(', 'import {', 'export default', 'module.exports',
        'console.log', 'console.error', 'try {', '} catch (', ' for (', ' while (',
        'arguments.length', 'this.', '.call(null', '.bind(this', '.map(', '.filter(', '.reduce(',
        '.forEach(', '.test(', '.exec(', '.match(', '.replace(', '.split(', '.join(',
        'JSON.stringify', 'JSON.parse', 'new Error(', 'throw new ', '# sourceMappingURL=',
        'static {'
    ]
    # Keywords are lower-cased before the comparison: the haystack is
    # lower-case, so entries such as 'JSON.stringify' or 'addEventListener'
    # could otherwise never match.
    if any(kw.lower() in text_lower for kw in code_keywords):
        # A keyword alone may just be mentioned in a prompt, so also require a
        # high share of punctuation relative to words.
        code_symbols = len(re.findall(r'[{}()\[\];=.,+\-*/&|!<>?:%]', text))
        words = len(re.findall(r'\b\w+\b', text))
        if words == 0 or code_symbols / (code_symbols + words) > 0.25:
            return True

    # 2. Common HTML/CSS patterns (also compared case-insensitively).
    html_css_keywords = [
        '<!DOCTYPE html>', '<html', '<head>', '<body', '<script', '<style',
        'padding:', 'margin:', 'color:', 'background-color:', 'font-size:',
        'display: flex', 'position: absolute', 'z-index:', 'border-radius:',
        '.CodeMirror', 'w-button', 'w-form'
    ]
    if any(kw.lower() in text_lower for kw in html_css_keywords):
        return True  # Pretty likely not a prompt

    # High density of HTML tags: more than 1 tag per 10 whitespace-separated words.
    html_tags = len(re.findall(r'<[/!]?\s*\w+', text))
    if html_tags > 5 and html_tags / len(text.split()) > 0.1:
        return True

    # High density of CSS punctuation, unless the text also carries prompt keywords.
    css_rules = len(re.findall(r'[{};:]', text))
    if css_rules > 10 and css_rules / len(text) > 0.05:
        prompt_keywords_check = ['tool use', 'rules', 'parameters', 'usage', 'objective', '<tool_name>']
        if not any(pk in text_lower for pk in prompt_keywords_check):
            return True

    # Long runs of HTML entities (entity tables and similar data).
    html_entities = len(re.findall(r'&[#a-zA-Z0-9]+;', text))
    if html_entities > 20 and html_entities / len(text) > 0.02:
        return True
    return False
def extract_prompt_templates(filepath, output_filepath="extracted_prompts_filtered_v2.txt", min_length=150):
    """
    Extracts potential prompt templates, attempting to strongly filter out non-prompts.

    A literal of at least `min_length` characters is kept when
    is_likely_code_or_markup() does not flag it and it contains at least one
    strong prompt keyword or three or more of the weaker ones
    (case-insensitive).

    Args:
        filepath (str): Path to the .js or .ts file.
        output_filepath (str): Path to save the extracted templates.
        min_length (int): Minimum character length for a template to be considered.

    Returns:
        int: Number of potential templates saved, or -1 on error.
    """
    if not os.path.exists(filepath):
        print(f"Error: Input file not found at {filepath}")
        return -1
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading input file {filepath}: {e}")
        return -1

    # Every backslash starts a two-character escape pair, so a literal ending in
    # an escaped backslash (`...\\`) closes where it should; the earlier
    # `\\`|[^`]` form read that as an escaped backtick and mis-paired all later
    # literals.  re.DOTALL lets the escaped character be a newline.
    template_literal_regex = r'`((?:\\.|[^`\\])*)`'

    # Increased specificity for prompt keywords/structures
    strong_prompt_keywords = [
        'You are BLACKBOXAI', '====\nTOOL USE\n====', '====\nRULES\n====',
        '====\nSYSTEM INFORMATION\n====', '====\nOBJECTIVE\n====',
        '<execute_command>', '<read_file>', '<create_file>', '<edit_file>',
        '<replace_in_file>', '<ask_followup_question>', '<attempt_completion>',
        'brainstorm_plan'
    ]
    other_prompt_keywords = [
        'Parameters:', 'Usage:', 'Description:', 'current working directory',
        'search_code', 'search_files', 'list_files', 'browser_action',
        'tool_name', 'parameter1_name', 'MCP SERVERS', 'CAPABILITIES'
    ]
    strong_prompt_keywords_lower = {kw.lower() for kw in strong_prompt_keywords}
    other_prompt_keywords_lower = {kw.lower() for kw in other_prompt_keywords}

    templates_saved_count = 0
    try:
        matches = re.findall(template_literal_regex, content, re.DOTALL)
        total_literals_found = len(matches)
        print(f"Found {total_literals_found} total template literals in the source.")
        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            outfile.write(f"--- Extracted Potential Prompt Templates from: {filepath} ---\n")
            outfile.write(f"--- (Filtered from {total_literals_found} total template literals found) ---\n\n")
            for original_index, match_content in enumerate(matches, start=1):
                template = match_content.strip()
                if len(template) < min_length:
                    continue
                template_lower = template.lower()

                # Reject code/markup first; the keyword scans below are only
                # needed for literals that survive this check.
                if is_likely_code_or_markup(template, template_lower):
                    continue

                has_strong_prompt_keyword = any(kw in template_lower for kw in strong_prompt_keywords_lower)
                other_keyword_count = sum(1 for kw in other_prompt_keywords_lower if kw in template_lower)

                # Require at least one strong keyword OR 3+ of the weaker ones.
                if has_strong_prompt_keyword or other_keyword_count >= 3:
                    templates_saved_count += 1
                    outfile.write(f"--- Template {templates_saved_count} (Original Index: {original_index}) ---\n")
                    outfile.write(template)
                    outfile.write("\n\n--------------------\n\n")
        print(f"Successfully saved {templates_saved_count} potential templates to: {output_filepath}")
        return templates_saved_count
    except (OSError, UnicodeError) as e:
        print(f"An error occurred during extraction or writing to file: {e}")
        return -1
# --- Main Execution ---
if __name__ == "__main__":
    import sys

    # Optional command-line arguments: [input_path [output_path]].
    # The defaults are the previously hard-coded values.
    file_to_analyze = sys.argv[1] if len(sys.argv) > 1 else "extension.js"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "extracted_prompts_filtered_v2.txt"
    print(f"Analyzing file: {file_to_analyze}")
    count = extract_prompt_templates(file_to_analyze, output_file)
    if count > 0:
        print(f"Extraction complete. Check the file '{output_file}' for results.")
    elif count == 0:
        print(f"\nNo likely prompt templates matching the more stringent criteria found or saved to '{output_file}'.")
        print("Consider adjusting filtering keywords or min_length if prompts are missed.")
    else:
        print("Extraction failed due to an error.")

View File

@@ -0,0 +1,206 @@
import re
import os
def is_likely_code_or_markup(text, text_lower):
    """
    Heuristically checks if a string is more likely code, HTML, or CSS
    than a natural language prompt.

    The decision combines keyword hits with punctuation/tag/entity ratios, so a
    prompt that merely mentions a code keyword is not rejected on that alone.

    Args:
        text (str): The candidate template text.
        text_lower (str): `text` lower-cased (passed in so callers lower-case once).

    Returns:
        bool: True when the text looks like code/markup, False otherwise.
    """
    # Code keywords that rarely dominate a text unless it *is* code.
    code_keywords = [
        'function(', ' class ', ' constructor(', ' Symbol(', '.prototype',  # Structure
        'addEventListener', 'querySelector', 'getElementById', 'createElement',  # DOM specific
        'Object.assign', 'Object.defineProperty', 'Promise.resolve', 'Promise.reject',  # Object/Promise
        'module.exports', 'export default', 'import {',  # Module system
        'console.log', 'console.error',  # Logging (less reliable on its own)
        ' try {', '} catch (', ' for (', ' while (',  # Control flow
        '.map(', '.filter(', '.reduce(', '.forEach(',  # Array methods often in code blocks
        'JSON.stringify', 'JSON.parse', 'new Error(', 'throw new ',
        '# sourceMappingURL='  # Definite noise
    ]
    # Keywords suggesting JS/TS that could also appear in prompts describing code
    ambiguous_code_keywords = ['async (', 'await ', 'this.', '=> {']
    # Common HTML/CSS patterns
    html_css_keywords = [
        '<!DOCTYPE html>', '<html', '<head>', '<body', '</script>', '</style>',
        'padding:', 'margin:', 'color:', 'background-color:', 'font-size:',
        'display: flex', 'position: absolute', 'z-index:', 'border-radius:',
        '.CodeMirror', 'w-button', 'w-form', '::placeholder', ':-ms-input-placeholder'
    ]

    def count_hits(keywords):
        # Keywords are lower-cased before the comparison: the haystack is
        # lower-case, so entries such as 'JSON.stringify', 'getElementById' or
        # '<!DOCTYPE html>' could otherwise never match.
        return sum(1 for kw in keywords if kw.lower() in text_lower)

    code_keyword_count = count_hits(code_keywords)
    ambiguous_code_keyword_count = count_hits(ambiguous_code_keywords)
    html_css_keyword_count = count_hits(html_css_keywords)

    # Symbol/tag ratios. word_count is clamped to 1 and the text length is
    # guarded so that empty input cannot divide by zero.
    code_symbols = len(re.findall(r'[{}()\[\];=.,+\-*/&|!<>?:%]', text))
    words = len(re.findall(r'\b\w+\b', text))
    word_count = words if words > 0 else 1
    symbol_ratio = code_symbols / (code_symbols + word_count)
    html_tags = len(re.findall(r'<[/!]?\s*\w+', text))
    html_tag_ratio = html_tags / word_count
    text_length = len(text)
    css_rules = len(re.findall(r'[{};:]', text))
    css_char_ratio = css_rules / text_length if text_length > 0 else 0
    html_entities = len(re.findall(r'&[#a-zA-Z0-9]+;', text))
    entity_ratio = html_entities / text_length if text_length > 0 else 0

    # --- Decision Logic for Noise ---
    # Very high symbol ratio without any code keyword -> likely data/minified text
    if symbol_ratio > 0.45 and code_keyword_count < 1 and ambiguous_code_keyword_count < 1:
        return True
    # Multiple specific code keywords + high symbol ratio suggests an actual code block
    if code_keyword_count >= 2 and symbol_ratio > 0.25:
        return True
    # Or several ambiguous ones + high symbols
    if ambiguous_code_keyword_count >= 2 and symbol_ratio > 0.30:
        return True
    # Web/CSS keywords are strong indicators of noise
    if html_css_keyword_count >= 2 or html_tag_ratio > 0.1:
        return True
    if css_char_ratio > 0.07:
        return True
    # High density of HTML entities
    if entity_ratio > 0.05 and html_entities > 15:
        return True
    return False  # Otherwise, might be a prompt
def extract_prompt_templates(filepath, output_filepath="extracted_prompts_filtered_v4.txt", min_length=150):
    """
    Extracts potential prompt templates, attempting to strongly filter out non-prompts.
    Version 4: Fine-tuned noise detection and keyword priority.

    A literal of at least `min_length` characters is kept when it
      - starts with a very strong opener (e.g. "You are BLACKBOXAI"), or
      - is not flagged by is_likely_code_or_markup() and has a structure marker,
        two or more distinct tool tags, one tool tag plus two other keywords,
        or four or more other keywords.
    All keyword matching is case-insensitive.

    Args:
        filepath (str): Path to the .js or .ts file.
        output_filepath (str): Path to save the extracted templates.
        min_length (int): Minimum character length for a template to be considered.

    Returns:
        int: Number of potential templates saved, or -1 on error.
    """
    if not os.path.exists(filepath):
        print(f"Error: Input file not found at {filepath}")
        return -1
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading input file {filepath}: {e}")
        return -1

    # Every backslash starts a two-character escape pair, so a literal ending in
    # an escaped backslash (`...\\`) closes where it should; the earlier
    # `\\`|[^`]` form read that as an escaped backtick and mis-paired all later
    # literals.  re.DOTALL lets the escaped character be a newline.
    template_literal_regex = r'`((?:\\.|[^`\\])*)`'

    # Openers indicating a high probability of being a prompt
    very_strong_prompt_keywords = [
        'you are blackboxai',
        'you are a helpful assistant',
    ]
    # Structure markers are also very strong indicators
    structure_markers = [
        '====\nTOOL USE\n====', '====\nRULES\n====',
        '====\nSYSTEM INFORMATION\n====', '====\nOBJECTIVE\n====',
        '====\nCAPABILITIES\n====', '====\nMCP SERVERS\n====',
        '--- START OF EXAMPLE ---', '--- END OF EXAMPLE ---'
    ]
    # Specific tool tags
    tool_tags = [
        '<execute_command>', '<read_file>', '<create_file>', '<edit_file>',
        '<replace_in_file>', '<ask_followup_question>', '<attempt_completion>',
        '<brainstorm_plan>', '<search_code>', '<search_files>', '<list_files>',
        '<browser_action>', '<use_mcp_tool>', '<access_mcp_resource>', '<tool_name>'
    ]
    other_prompt_keywords = [
        'parameters:', 'usage:', 'description:', 'current working directory',
        'tool use formatting', 'tool use guidelines', '# tools', 'mcp servers are not always necessary'
    ]
    very_strong_openers = tuple(kw.lower() for kw in very_strong_prompt_keywords)
    structure_lower = {kw.lower() for kw in structure_markers}
    tool_tags_lower = {kw.lower() for kw in tool_tags}
    other_lower = {kw.lower() for kw in other_prompt_keywords}

    templates_saved_count = 0
    try:
        matches = re.findall(template_literal_regex, content, re.DOTALL)
        total_literals_found = len(matches)
        print(f"Found {total_literals_found} total template literals in the source.")
        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            outfile.write(f"--- Extracted Potential Prompt Templates from: {filepath} ---\n")
            outfile.write(f"--- (Filtered from {total_literals_found} total template literals found, v4 logic) ---\n\n")
            for original_index, match_content in enumerate(matches, start=1):
                template = match_content.strip()
                if len(template) < min_length:
                    continue
                template_lower = template.lower()

                # 1. A very strong opener is accepted outright and bypasses the
                #    noise check (such prompts often embed code examples).
                is_potential_prompt = template_lower.startswith(very_strong_openers)

                if not is_potential_prompt:
                    # 2. Anything that looks like code/markup is dropped.
                    if is_likely_code_or_markup(template, template_lower):
                        continue
                    # 3. Otherwise require structural or keyword evidence.
                    has_structure_marker = any(kw in template_lower for kw in structure_lower)
                    tool_tag_count = sum(1 for tag in tool_tags_lower if tag in template_lower)
                    other_keyword_count = sum(1 for kw in other_lower if kw in template_lower)
                    is_potential_prompt = (
                        has_structure_marker
                        or tool_tag_count >= 2
                        or (tool_tag_count >= 1 and other_keyword_count >= 2)
                        or other_keyword_count >= 4
                    )

                if is_potential_prompt:
                    templates_saved_count += 1
                    outfile.write(f"--- Template {templates_saved_count} (Original Index: {original_index}) ---\n")
                    outfile.write(template)
                    outfile.write("\n\n--------------------\n\n")
        print(f"Successfully saved {templates_saved_count} potential templates to: {output_filepath}")
        return templates_saved_count
    except (OSError, UnicodeError) as e:
        print(f"An error occurred during extraction or writing to file: {e}")
        return -1
# --- Main Execution ---
if __name__ == "__main__":
    import sys

    # Optional command-line arguments: [input_path [output_path]].
    # The defaults are the previously hard-coded values.
    file_to_analyze = sys.argv[1] if len(sys.argv) > 1 else "extension.js"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "extracted_prompts_filtered_v4.txt"
    print(f"Analyzing file: {file_to_analyze}")
    count = extract_prompt_templates(file_to_analyze, output_file)
    if count > 0:
        print(f"Extraction complete. Check the file '{output_file}' for results.")
    elif count == 0:
        print(f"\nNo likely prompt templates matching the v4 criteria found or saved to '{output_file}'.")
        print("Consider adjusting filtering keywords or min_length if prompts are missed.")
    else:
        print("Extraction failed due to an error.")