UPDATE 2017-04-28
adding 39&
and 0&#
test case - no code change needed - wow, that was lucky :)
Important to note that I'm deliberately not allowing an ampersand to exist in the text to be searched except as the start of an escape sequence, whether it's a valid escape sequence or not, i.e. I'm allowing &39;
to be an escape sequence even though it is technically invalid. This makes it easy to say that if we are asked to find a string with an ampersand which is not part of a complete escape sequence (e.g. 0&#) then this should not be matched, and is an invalid search string. The use of
AmpExcape below to do this, rather than terminating and returning the unchanged string, is a convenience since other use-cases of this regex (i.e. outside JavaScript) do not allow me conditional branching statements (or function callbacks on matches, for that matter). For my purposes, this comes closest to the definition I'm working against for escaped HTML.
UPDATE 2017-04-26
adding &g
test case and editing answer/code for that case by escaping the find string
UPDATE 2017-04-25
A two pass regex solution:
/**
 * Two-pass, regex-only find and replace over escaped HTML.
 *
 * Pass 1 matches either the search term or an entity-like token
 * (`&\w*;` or `&#[0-9a-fA-F]*;`) and wraps each match in `&&` markers.
 * Pass 2 collapses the markers: a marked search-term match becomes the
 * replacement text, and the marker emitted in front of an entity is removed,
 * leaving the entity untouched. Matching is global and case-insensitive.
 *
 * @param {string} test_str    Text to search.
 * @param {string} find_str    Literal text to look for; a '&' that does not
 *                             start a complete `&...;` sequence is doubled by
 *                             AmpExcape before the pattern is built.
 * @param {string} replace_str Literal replacement text.
 * @returns {string} The text after both passes.
 */
function do_replace(test_str, find_str, replace_str) {
    const escaped_find_str = RegExcape(AmpExcape(find_str));
    const escaped_replace_str = RegExcape(replace_str);

    // '$' is special inside a replacement template ($1, $&, $$ ...). Double it
    // so pass 1 inserts replace_str verbatim; otherwise pass 2, which looks for
    // the literal replace_str, would not recognise what pass 1 produced.
    const literal_replace_str = replace_str.replace(/\$/g, '$$$$');

    // Pass 1: "&&<target><replacement>&&" for a target match,
    //         "&&<replacement>&&<entity>" for an entity match.
    const first_regex = new RegExp('(' + escaped_find_str + ')|(&\\w*;|&#[0-9a-fA-F]*;)', 'gi');
    const first_pass = test_str.replace(first_regex, '&&$1' + literal_replace_str + '&&$2');

    // Pass 2: the first alternative matches the entity marker (group 2 stays
    // empty, so it is deleted); the second matches a marked target and keeps
    // only the replacement captured in group 2.
    const second_regex = new RegExp('&&(?:' + escaped_replace_str + '&&|(' + escaped_find_str + ')?(' + escaped_replace_str + ')?&&)', 'gi');
    return first_pass.replace(second_regex, '$2');
}
I was looking for an approach that used regex only and whilst the solutions by @sln and @tomas-langkaas were useful (well, mind blowing), I still wanted a method that I could use in JS and beyond.
I have something that's working for my test cases but it is forced to do multiple passes over the text. Originally I had three passes but I think I have it working with 2 passes. Obviously it's not as efficient as the other answers.
I'm attempting to match: <target string> | <entity>
as those answers did, and flipped the order to be entity first as @sln did in the latest update. But I'm using &&
as a 'new' escape sequence. The strategy is:
Initially we escape the find and replacement strings for their use in regex. For the <target string>,
if there is an &
that is not part of a complete entity, then we replace it with a double &&
with the aim of preventing it from ever being matched, i.e. I'm considering the search for a partial entity to never be possible in escaped HTML
the first pass replaces any matches (of either entities or target strings) with
&&<target group value><replacement string>&&<entity group value>
on a match of the target string the <entity group value>
will be blank, and we will return the &&
escape, the target string, the replacement string, and a final &&
escape
- on a match of an entity, the
<target group value>
will now be empty, so we end up returning &&<replacement str>&&
followed by the entity value.
- the second pass can look for all occurrences of
&&<target string><replacement string>&&
and replace with <replacement string>
- also in the second pass we can look for
&&<replacement string>&&
and know it is supposed to be replaced with blank.
- we dont have to do anything for entity matches this time as we have left them untouched in pass 1
Here's the complete code with test cases (attribution for RegExcape maintained from @tomas-langkaas):
// helper for building regexes from strings
// http://stackoverflow.com/a/3561711/6738706
/**
 * Double every '&' in `str` that does not begin a complete `&...;` sequence
 * (`&` + word characters + `;`, or `&#` + hex digits + `;`). Complete
 * sequences are returned unchanged.
 *
 * @param {string} str Search term to prepare.
 * @returns {string} The term with each bare '&' turned into '&&'.
 */
function AmpExcape(str) {
    // Group 1 captures the tail of a complete sequence; group 2 captures a bare '&'.
    const entity_or_bare_amp = /&(\w*;|#[0-9a-fA-F]*;)|(&)/g;
    // Only one group participates per match, so this rebuilds either
    // '&' + tail (unchanged) or '&' + '&' (doubled).
    const rebuilt = '&$2$1';
    return str.replace(entity_or_bare_amp, rebuilt);
}
/**
 * Backslash-escape the regular-expression metacharacters in `str` so the
 * result can be embedded in a RegExp source and match `str` literally.
 *
 * @param {string} str Text to escape.
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
function RegExcape(str) {
    const regex_metachars = /[-\/\\^$*+?.()|[\]{}]/g;
    // '$&' stands for the matched character itself.
    const backslash_prefixed = '\\$&';
    return str.replace(regex_metachars, backslash_prefixed);
}
/**
 * Two-pass, regex-only find and replace over escaped HTML.
 *
 * Pass 1 matches either the search term or an entity-like token
 * (`&\w*;` or `&#[0-9a-fA-F]*;`) and wraps each match in `&&` markers.
 * Pass 2 collapses the markers: a marked search-term match becomes the
 * replacement text, and the marker emitted in front of an entity is removed,
 * leaving the entity untouched. Matching is global and case-insensitive.
 *
 * @param {string} test_str    Text to search.
 * @param {string} find_str    Literal text to look for; a '&' that does not
 *                             start a complete `&...;` sequence is doubled by
 *                             AmpExcape before the pattern is built.
 * @param {string} replace_str Literal replacement text.
 * @returns {string} The text after both passes.
 */
function do_replace(test_str, find_str, replace_str) {
    const escaped_find_str = RegExcape(AmpExcape(find_str));
    const escaped_replace_str = RegExcape(replace_str);

    // '$' is special inside a replacement template ($1, $&, $$ ...). Double it
    // so pass 1 inserts replace_str verbatim; otherwise pass 2, which looks for
    // the literal replace_str, would not recognise what pass 1 produced.
    const literal_replace_str = replace_str.replace(/\$/g, '$$$$');

    // Pass 1: "&&<target><replacement>&&" for a target match,
    //         "&&<replacement>&&<entity>" for an entity match.
    const first_regex = new RegExp('(' + escaped_find_str + ')|(&\\w*;|&#[0-9a-fA-F]*;)', 'gi');
    const first_pass = test_str.replace(first_regex, '&&$1' + literal_replace_str + '&&$2');

    // Pass 2: the first alternative matches the entity marker (group 2 stays
    // empty, so it is deleted); the second matches a marked target and keeps
    // only the replacement captured in group 2.
    const second_regex = new RegExp('&&(?:' + escaped_replace_str + '&&|(' + escaped_find_str + ')?(' + escaped_replace_str + ')?&&)', 'gi');
    return first_pass.replace(second_regex, '$2');
}
// Test driver. The searched text is escaped HTML, so entities are written in
// their escaped form (&lt; &gt; &#39;); "&39;" is a deliberately invalid
// entity that must still be treated as an escape sequence.
let str = '39&lt;img style=&#39;width: 39;&#39; bor9;wder=39&#39;0&#39;&39; /&gt;39;';
let test_list = ['39','39;','9;','9;w','39&','0&#'];
run_test(str,test_list);

// NOTE: "/$gt;" (with '$') is kept exactly as it appears in the sample output.
str = '&lt;img border=&#39;0&#39; /$gt;';
test_list = ['lt','39','&#39;','border=&#39;'];
run_test(str,test_list);

// Plain text without entities: every occurrence must be replaced.
str = 'test string ring ring';
test_list = ['ring'];
run_test(str,test_list);

// Includes partial-entity search terms ('&g', 't;') that must never match.
str = '39&lt;img style=&#39;width: 39;&#39; border=&#39;0&#39;&39; /&gt;39;';
test_list = ['border','0','&#39;','39','lt','&lt;','border=&#39;','&lt;img','&g','t;'];
run_test(str,test_list);
/**
 * Print a small report to the console: a blank line, the original text, then
 * one line per search term showing the result of replacing that term with
 * itself wrapped in '|' characters. Labels are padded to a common width.
 *
 * @param {string}   base_str  Text to search.
 * @param {string[]} find_list Search terms to try against `base_str`.
 */
function run_test(base_str, find_list) {
    const orig_label = 'original';
    // Width of the widest label, including the 'original' heading.
    const max_len = [orig_label, ...find_list].reduce(
        (widest, label) => Math.max(widest, label.length),
        0
    );

    console.log();
    // Use the parameter, not a global: the report must describe the text that
    // was actually passed in.
    console.log(pad(orig_label, max_len) + ': ' + base_str);
    find_list.forEach(function (gstr) {
        console.log(pad(gstr, max_len) + ': ' + do_replace(base_str, gstr, '|' + gstr + '|'));
    });
}
/**
 * Right-pad `str` with spaces until it is at least `len` characters long.
 * Text that is already long enough is returned unchanged.
 *
 * @param {string} str Text to pad.
 * @param {number} len Minimum length of the result.
 * @returns {string} The padded text.
 */
function pad(str, len) {
    let padded = str;
    while (padded.length < len) {
        padded += ' ';
    }
    return padded;
}
and the output
original: 39&lt;img style=&#39;width: 39;&#39; bor9;wder=39&#39;0&#39;&39; /&gt;39;
39      : |39|&lt;img style=&#39;width: |39|;&#39; bor9;wder=|39|&#39;0&#39;&39; /&gt;|39|;
39;     : 39&lt;img style=&#39;width: |39;|&#39; bor9;wder=39&#39;0&#39;&39; /&gt;|39;|
9;      : 39&lt;img style=&#39;width: 3|9;|&#39; bor|9;|wder=39&#39;0&#39;&39; /&gt;3|9;|
9;w     : 39&lt;img style=&#39;width: 39;&#39; bor|9;w|der=39&#39;0&#39;&39; /&gt;39;
39&     : 39&lt;img style=&#39;width: 39;&#39; bor9;wder=39&#39;0&#39;&39; /&gt;39;
0&#     : 39&lt;img style=&#39;width: 39;&#39; bor9;wder=39&#39;0&#39;&39; /&gt;39;
original    : &lt;img border=&#39;0&#39; /$gt;
lt          : &lt;img border=&#39;0&#39; /$gt;
39          : &lt;img border=&#39;0&#39; /$gt;
&#39;       : &lt;img border=|&#39;|0|&#39;| /$gt;
border=&#39;: &lt;img |border=&#39;|0&#39; /$gt;
original: test string ring ring
ring    : test st|ring| |ring| |ring|
original    : 39&lt;img style=&#39;width: 39;&#39; border=&#39;0&#39;&39; /&gt;39;
border      : 39&lt;img style=&#39;width: 39;&#39; |border|=&#39;0&#39;&39; /&gt;39;
0           : 39&lt;img style=&#39;width: 39;&#39; border=&#39;|0|&#39;&39; /&gt;39;
&#39;       : 39&lt;img style=|&#39;|width: 39;|&#39;| border=|&#39;|0|&#39;|&39; /&gt;39;
39          : |39|&lt;img style=&#39;width: |39|;&#39; border=&#39;0&#39;&39; /&gt;|39|;
lt          : 39&lt;img style=&#39;width: 39;&#39; border=&#39;0&#39;&39; /&gt;39;
&lt;        : 39|&lt;|img style=&#39;width: 39;&#39; border=&#39;0&#39;&39; /&gt;39;
border=&#39;: 39&lt;img style=&#39;width: 39;&#39; |border=&#39;|0&#39;&39; /&gt;39;
&lt;img     : 39|&lt;img| style=&#39;width: 39;&#39; border=&#39;0&#39;&39; /&gt;39;
&g          : 39&lt;img style=&#39;width: 39;&#39; border=&#39;0&#39;&39; /&gt;39;
t;          : 39&lt;img style=&#39;width: 39;&#39; border=&#39;0&#39;&39; /&gt;39;
UPDATE 2017-04-24
I abandoned the approach below for a couple of reasons, but most importantly it will only find the first occurrence of the find string in a run of text that has no ampersands or semi-colons. For instance the string test string ring ring
will only be matched as test st|ring| ring ring
if the match is for ring
- this seems pretty useless for a find and replace - I'm updating the answer so it works for matching ring
at least the first time, as I previously missed the line start as an allowed terminal character, but I don't consider this a valid solution for all possible texts.
ORIGINAL ANSWER (with modifications)
If you care about semi-colon appearing in the text other than as a terminal character to a corresponding &
, like for inline styles where it might say style="width: 39;"
, then you need something a bit complicated:
'((?:^|(?!(?:[^&;]+' + str + ')))(?=(?:(?:&|;|^)[^&;]*))(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?)(' + str + ')'
The code presented further down will output the original test cases:
"original": &lt;img border=&#39;0&#39; /$gt;
"lt": &lt;img border=&#39;0&#39; /$gt;
"39": &lt;img border=&#39;0&#39; /$gt;
"&#39;": &lt;img border=|&#39;|0|&#39;| /$gt;
"border=&#39;": &lt;img |border=&#39;|0&#39; /$gt;
It also shows the specific output against semi-colons appearing in the text and search terms.
"original": 39&lt;img style=&#39;width: 39;&#39; bor;der=39&#39;0&#39;&39; /&gt;39;
test string that may be followed by semi-colon :
|39|&lt;img style=&#39;width: |39|;&#39; bor;der=|39|&#39;0&#39;&39; /&gt;|39|;
test match with semi-colon:
39&lt;img style=&#39;width: |39;|&#39; bor;der=39&#39;0&#39;&39; /&gt;|39;|
test match with semi-colon mid string
39&lt;img style=&#39;width: 39;&#39; |bor;der|=39&#39;0&#39;&39; /&gt;39;
NOTE And this is the example of where the approach falls apart:
"original": test string ring ring
test st|ring| ring ring
There is no lookbehind that would allow a 'memory' effect to differentiate between zero characters after a previous match and zero characters, but within an escaped sequence. So it would be impossible to match the second occurrence of ring
without also matching 39
in the string &#39;
Heres the example code:
/**
 * Build the global, case-insensitive RegExp used by the single-pass approach.
 * Group 1 captures the text between the previous terminal character (or the
 * start of the line) and the search term; group 2 captures the term itself.
 *
 * @param {string} str Search term, inserted into the pattern as-is (not escaped).
 * @returns {RegExp} A new RegExp with the 'gi' flags.
 */
function make_regex(str) {
    // Refuse to start where more non-terminal text followed by the term lies
    // ahead, unless we are at the start of the line.
    const not_mid_run = '(?:^|(?!(?:[^&;]+' + str + ')))';
    // Require the match to begin on '&', ';' or the start of the line.
    const at_terminal = '(?=(?:(?:&|;|^)[^&;]*))';
    // Optionally consume the text leading up to the term, but never from '&'.
    const lead_in = '(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?';

    const pattern = '(' + not_mid_run + at_terminal + lead_in + ')' + '(' + str + ')';
    return new RegExp(pattern, 'gi');
}
/**
 * Single-pass replace built on make_regex. The text captured in front of the
 * search term (group 1) is written back, followed by `replace_str`.
 *
 * @param {string} test_str    Text to search.
 * @param {string} find_str    Search term handed to make_regex.
 * @param {string} replace_str Replacement template; `$2` refers to the matched
 *                             search term.
 * @returns {string} The text with matches replaced.
 */
function do_replace(test_str, find_str, replace_str) {
    // Keep the leading chunk ($1) and substitute only the term itself.
    const template = `$1${replace_str}`;
    return test_str.replace(make_regex(find_str), template);
}
// Demo for the single-pass approach. The searched text is escaped HTML, so
// entities are written in their escaped form (&lt; &gt; &#39;).
let str = '39&lt;img style=&#39;width: 39;&#39; bor;der=39&#39;0&#39;&39; /&gt;39;';
console.log();
console.log('"original": ', str);
console.log('test string that may be followed by semi-colon :');
console.log(do_replace(str, '39', '|$2|' ));
console.log('test match with semi-colon:');
console.log(do_replace(str, '39;', '|39;|' ));
console.log('test match with semi-colon mid string');
console.log(do_replace(str, 'bor;der', '|bor;der|' ));

// NOTE: "/$gt;" (with '$') is kept exactly as it appears in the sample output.
str = '&lt;img border=&#39;0&#39; /$gt;';
console.log();
console.log('"original": ', str);
console.log('"lt": ', do_replace(str, 'lt', '|lt|' ));
console.log('"39": ', do_replace(str, '39', '|39|' ));
console.log('"&#39;": ', do_replace(str, '&#39;', '|&#39;|' ));
console.log('"border=&#39;":', do_replace(str, 'border=&#39;', '|border=&#39;|' ));

// Case where this approach falls apart: only the first occurrence is replaced.
str = 'test string ring ring';
console.log();
console.log('"original": ', str);
console.log(do_replace(str, 'ring', '|$2|'));
Its important to note that the regex captures not only the text you want but the chunk of text before this. This affects your use of $1
as a replacement value since the text you want is now $2
The explanation of the terms is not straightforward but it can be broken down into:
A negative lookahead including string to find that prevents beginning the matching on non-terminal characters
(?:^|(?!(?:[^&;]+' + str + ')))
A positive lookahead that forces the matching to begin on a terminal character, or the start of the line
(?=(?:(?:&|;|^)[^&;]*))
A negative lookahead that prevents the matching from starting on an &
but allows the line start or a previous semi-colon
(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?
And then finally the string to match
The effect is that we match the whole section of the text that contains the text we want, from the previous terminal value, to the end of the string to match. Because of this, JavaScript ends up with matches that are the full chunk that was matched. For instance, when we ask for border=&#39;
we will end up with the chunk ;img border=&#39;
. So we define two capture groups, one for the part of the chunk we are not interested in and one for our match. This allows us to use $1$2
to recreate the string or $1whatever
to replace only the part we want. We can then use str.replace()
with this strategy