I find your question an interesting challenge, so here is a code that do what you want, doing this using Regex
alone it's impossible because there is nested expression, this is a solution using a combination of Regex
and string manipulations to handle nested expressions:
# -*- coding: utf-8 -*-
import re
RE_IDENTIFIER = r'\b[a-z]\w*\b(?!\s*[\[\("\'])'
RE_INDEX_ONLY = re.compile(r'(##)(\d+)(##)')
RE_INDEX = re.compile('##\d+##')
def extract_expression(string):
""" extract all identifier and getitem expression in the given order."""
def remove_brackets(text):
# 1. handle `[...]` expression replace them with #{#...#}#
# so we don't confuse them with word[...]
pattern = '(?<!\w)(\s*)(\[)([^\[]+?)(\])'
# keep extracting expression until there is no expression
while re.search(pattern, text):
text = re.sub(pattern, r'\1#{#\3#}#', string)
return text
def get_ordered_subexp(exp):
""" get index of nested expression."""
index = int(exp.replace('#', ''))
subexp = RE_INDEX.findall(expressions[index])
if not subexp:
return exp
return exp + ''.join(get_ordered_subexp(i) for i in subexp)
def replace_expression(match):
""" save the expression in the list, replace it with special key and it's index in the list."""
match_exp = match.group(0)
current_index = len(expressions)
expressions.append(None) # just to make sure the expression is inserted before it's inner identifier
# if the expression contains identifier extract too.
if re.search(RE_IDENTIFIER, match_exp) and '[' in match_exp:
match_exp = re.sub(RE_IDENTIFIER, replace_expression, match_exp)
expressions[current_index] = match_exp
return '##{}##'.format(current_index)
def fix_expression(match):
""" replace the match by the corresponding expression using the index"""
return expressions[int(match.group(2))]
# result that will contains
expressions = []
string = remove_brackets(string)
# 2. extract all expression and keep track of there place in the original code
pattern = r'\w+\s*\[[^\[]+?\]|{}'.format(RE_IDENTIFIER)
# keep extracting expression until there is no expression
while re.search(pattern, string):
# every exression that is extracted is replaced by a special key
string = re.sub(pattern, replace_expression, string)
# some times inside brackets can contains getitem expression
# so when we extract that expression we handle the brackets
string = remove_brackets(string)
# 3. build the correct result with extracted expressions
result = [None] * len(expressions)
for index, exp in enumerate(expressions):
# keep replacing special keys with the correct expression
while RE_INDEX_ONLY.search(exp):
exp = RE_INDEX_ONLY.sub(fix_expression, exp)
# finally we don't forget about the brackets
result[index] = exp.replace('#{#', '[').replace('#}#', ']')
# 4. Order the index that where extracted
ordered_index = ''.join(get_ordered_subexp(exp) for exp in RE_INDEX.findall(string))
# convert it to integer
ordered_index = [int(index[1]) for index in RE_INDEX_ONLY.findall(ordered_index)]
# 5. fix the order of expressions using the ordered indexes
final_result = []
for exp_index in ordered_index:
final_result.append(result[exp_index])
# for debug:
# print('final string:', string)
# print('expression :', expressions)
# print('order_of_expresion: ', ordered_index)
return final_result
code = 'foo + bar[1] + baz[1:10:var1[2+1]] + qux[[1,2,int(var2)]] + bob[len("foobar")] + func() + func2 (var3[0])'
code2 = 'baz[1:10:var1[2+1]]'
code3 = 'baz[[1]:10:var1[2+1]:[var3[3+1*x]]]'
print(extract_expression(code))
print(extract_expression(code2))
print(extract_expression(code3))
OUTPU:
['foo', 'bar[1]', 'baz[1:10:var1[2+1]]', 'var1[2+1]', 'qux[[1,2,int(var2)]]', 'var2', 'bob[len("foobar")]', 'var3[0]']
['baz[1:10:var1[2+1]]', 'var1[2+1]']
['baz[[1]:10:var1[2+1]:[var3[3+1*x]]]', 'var1[2+1]', 'var3[3+1*x]', 'x']
I tested this code for very complicated examples and it worked perfectly. and notice that the order if extraction is the same as you wanted, Hope that this is what you needed.