14

I have some escaped HTML like this:

&lt;img border=&#039;0&#039; /&gt;

I'm trying to match and replace full escape sequences like &#039; but not partial ones, like 39, since 39 is not actually in the unescaped string. Essentially, each escape sequence should be treated like a single token.

This is a JS regex. Is there a way to exclude matches between & and ; while still accepting sequences that include both of those characters?

Desired results:

  • Search &lt;img border=&#039;0&#039; /&gt; for lt: No match.
  • Search &lt;img border=&#039;0&#039; /&gt; for 39: No match.
  • Search &lt;img border=&#039;0&#039; /&gt; for &#039;: Match.
  • Search &lt;img border=&#039;0&#039; /&gt; for border=&#039;: Match.

Current code:

> var str = '&lt;img border=&#039;0&#039; /&gt;'
> str.replace(/(border)/gi, '|$1|')
'&lt;img |border|=&#039;0&#039; /&gt;'  // ok
> str.replace(/(39)/gi, '|$1|')
'&lt;img border=&#0|39|;0&#0|39|; /&gt;'  // not ok

Note: I can't unescape and then re-escape to match. It has to be escaped.

nathancahill
  • 9,421
  • 8
  • 44
  • 87

10 Answers10

3

The OP wants a JavaScript regex to match and replace a string within escaped HTML while treating escape sequences (e.g. &lt;, &#039;, or &nbsp;) as single characters, without unescaping the HTML string during the replacement process.

This means that replacing

  1. "lt" with "[lt]" in "&lt; lt" would result in "&lt; [lt]" (avoid match within entity)
  2. "&lt;" with "[&lt;]" in "&lt; lt" would result in "[&lt;] lt" (match entity)
  3. "&l" with "[&l]" in "&lt; &lt" would result in "&lt; [&l]t" (not match partial entity)
  4. "t;" with "[t;]" in "&lt; lt;" would result in "&lt; l[t;]" (not match partial entity)
  5. "&lt; l" with "[&lt; l]" in "&lt; lt" would result in "[&lt; l]t" (match including entity)
  6. "lt; &l" with "[lt; &l]" in "&lt; &lt" would result in "&lt; &lt" (not match partial entity)
  7. "t; &lt;" with "[t; &lt;]" in "lt; &lt;" would result in "l[t; &lt;]" (match including entity)
  8. "t; &lt" with "[t; &lt]" in "lt; &lt;" would result in "lt; &lt;" (not match partial entity)

With the following regex for capturing escaped sequences (e.g. &lt;, &#039;, or &nbsp;),

/&[a-z]+;|&#x[a-f\d]+;|&#\d+;/gi

we may use the following function as a starting point that handles most of the cases above (#1, #2, #4, #5, and #7):

/**
 * Replace occurrences of `searchFor` in `str` with `replacement`, treating
 * complete HTML entities (named, hex and decimal) as indivisible tokens.
 *
 * The pattern is "search text | (entity)": an entity that is matched by the
 * second alternative is captured and written back unchanged, which stops the
 * search text from matching inside it on a later scan position.
 *
 * @param {string} searchFor   literal text to look for (escaped via prepare())
 * @param {string} replacement text inserted for each search match
 * @param {string} str         escaped HTML to process
 * @returns {string} the processed string
 */
function searchAndReplace(searchFor, replacement, str) {
  // Alternative that consumes whole entities so they are skipped as a unit.
  var entityAlternative = "|(&[a-z]+;|&#x[a-f\\d]+;|&#\\d+;)";
  var pattern = new RegExp(prepare(searchFor) + entityAlternative, "gi");

  return str.replace(pattern, function(matched, capturedEntity) {
    if (capturedEntity) {
      // An entity was consumed: put it back untouched.
      return capturedEntity;
    }
    return replacement;
  });
}

/**
 * Turn a literal search string into regex source by backslash-escaping every
 * character that is neither a word character nor whitespace [1].
 *
 * @param {string} str literal search text
 * @returns {string} regex source matching that text literally
 */
function prepare(str) {
  var nonLiteralChar = /[^\w\s]/g;
  var escapedSource = str.replace(nonLiteralChar, "\\$&");
  return escapedSource;
}

// [1] from http://eloquentjavascript.net/09_regexp.html#h_Rhu25fogrG

The remaining cases (#3, #6, #8) involve a potential partial escaped sequence at the end of the search string.

A solution for this is to check the searchFor string for potential partial escaped sequences at the end and append a corresponding negated lookahead (?!) to prevent matching a valid escaped sequence. The full solution (passing a set of about 40 test cases) is shown below, and should be faster and less complex than an .exec() approach:

/**
 * Replace occurrences of `searchFor` in `str` with `replacement`, treating
 * complete HTML entities (named, hex and decimal) as indivisible tokens.
 *
 * The search text is tried first; prepare() appends a negative lookahead when
 * the search text ends in a partial entity. Whole entities are consumed by the
 * second alternative and written back unchanged.
 *
 * @param {string} searchFor   literal text to look for
 * @param {string} replacement text inserted for each search match
 * @param {string} str         escaped HTML to process
 * @returns {string} the processed string
 */
function searchAndReplace(searchFor, replacement, str) {
  var entityAlternative = "|(&[a-z]+;|&#x[a-f0-9]+;|&#\\d+;)";
  var pattern = new RegExp(prepare(searchFor) + entityAlternative, "gi");

  return str.replace(pattern, function(matched, capturedEntity) {
    if (capturedEntity) {
      // Second alternative matched: keep the entity exactly as it was.
      return capturedEntity;
    }
    return replacement;
  });
}

/**
 * Turn a literal search string into regex source.
 *
 * All characters other than word characters and whitespace are
 * backslash-escaped. If the search string ends in something that could be the
 * start of an HTML entity ("&", "&lt", "&#", "&#03", "&#x", "&#x7f"), a
 * negative lookahead is appended so the search cannot match the leading part
 * of a complete entity in the target string.
 *
 * @param {string} str literal search text
 * @returns {string} regex source (intended for use with the "gi" flags)
 */
function prepare(str) {
  var add = "";
  if (/&$/.test(str)) {
    // Hex digits only: must agree with the entity pattern "&#x[a-f0-9]+;".
    add = "(?!#x[a-f\\d]+;|#\\d+;|[a-z]+;)";
  } else if (/&[a-z]+$/i.test(str)) {
    add = "(?![a-z]*;)";
  } else if (/&#$/.test(str)) {
    add = "(?!x[a-f\\d]+;|\\d+;)";
  } else if (/&#\d+$/.test(str)) {
    // Partial decimal reference, e.g. "&#03" must not match inside "&#039;".
    add = "(?!\\d*;)";
  } else if (/&#x$/i.test(str)) {
    // Case-insensitive because the final regex is built with the "i" flag.
    add = "(?![a-f\\d]+;)";
  } else if (/&#x[a-f\d]+$/i.test(str)) {
    add = "(?![a-f\\d]*;)";
  }
  return str.replace(/[^\w\s]/g, "\\$&") + add;
}

// test function

/**
 * Run searchAndReplace() on one case and log "<searchFor>: Passed" or
 * "<searchFor>: Failed: <expected>,<actual>" to the console.
 *
 * @param {string} searchFor   text to search for
 * @param {string} replacement replacement text
 * @param {string} str         input string
 * @param {string} expected    expected output of searchAndReplace()
 */
function test(searchFor, replacement, str, expected) {
  var actual = searchAndReplace(searchFor, replacement, str);
  var verdict;
  if (actual === expected) {
    verdict = "Passed";
  } else {
    // Array-to-string coercion yields "expected,actual".
    verdict = "Failed: " + [expected, actual];
  }
  console.log(searchFor + ": " + verdict);
}

// test cases

// Each call is test(searchFor, replacement, input, expected).
// NOTE(review): several literals below look as if they were HTML-decoded in
// this copy (search strings of "'" and "<", an empty search string, inputs
// containing "߿"). Confirm the entity spellings against the original
// answer before relying on these cases.
test("lt", "[lt]", "<img border='0' />", "<img border='0' />");
test("39", "[39]", "<img border='0' />", "<img border='0' />");
test("'", "[']", "<img border='0' />", "<img border=[']0['] />");
test("border='", "[border=']", "<img border='0' />", "<img [border=']0' />");
test("39&", "[39&]", "39<img border=39'&gt&gt&&#039 t; 0'&39; />", "39<img border=39'&gt&gt&&#039 t; 0'&39; />")
test("0&#", "[0&#]", "39<img border=39'&gt&gt&&#039 t; 0'&39; />", "39<img border=39'&gt&gt&&#039 t; 0'&39; />")
test("lt", "[]", "&lt<t;t&l", "&[]<t;t&l");
test("<", "[]", "&lt<t;t&l", "&lt[]t;t&l");
test("&l", "[]", "&lt<t;t&l", "[]t<t;t[]");
test("t;", "[]", "&lt<t;t&l", "&lt<[]t&l");
test("t&", "[]", "&lt<t;t&l", "&lt<t;[]l");
test("<t", "[]", "&lt<t;t&l", "&lt[];t&l");
test("t<", "[]", "&lt<t;t&l", "&l[]t;t&l");
test("t;t", "[]", "&lt<t;t&l", "&lt<[]&l");
test("t&l", "[]", "&lt<t;t&l", "&lt<t;[]");
test("39", "[]", "&#039'9;9&#", "&#0[]'9;9&#");
test("'", "[]", "&#039'9;9&#", "&#039[]9;9&#");
test("&", "[]", "&#039'9;9&#", "[]#039'9;9[]#");
test("&#", "[]", "&#039'9;9&#", "[]039'9;9[]");
test("9;", "[]", "&#039'9;9&#", "&#039'[]9&#");
test("9&", "[]", "&#039'9;9&#", "&#039'9;[]#");
test("'9", "[]", "&#039'9;9&#", "&#039[];9&#");
test("9'", "[]", "&#039'9;9&#", "&#03[]9;9&#");
test("9;9", "[]", "&#039'9;9&#", "&#039'[]&#");
test("9&#", "[]", "&#039'9;9&#", "&#039'9;[]");
test("x7", "[]", "߿f&#x", "&#[]ff;f&#x");
test("", "[]", "߿f&#x", "&#x7f[]f;f&#x");
test("&", "[]", "߿f&#x", "[]#x7ff;f[]#x");
test("&#", "[]", "߿f&#x", "[]x7ff;f[]x");
test("&#x", "[]", "߿f&#x", "[]7ff;f[]");
test("&#x7", "[]", "߿f&#x", "[]ff;f&#x");
test("f;", "[]", "߿f&#x", "&#x7f[]f&#x");
test("f&", "[]", "߿f&#x", "߿[]#x");
test("f", "[]", "߿f&#x", "&#x7f[];f&#x");
test("f", "[]", "߿f&#x", "&#x7[]f;f&#x");
test("f;f", "[]", "߿f&#x", "&#x7f[]&#x");
test("f&#", "[]", "߿f&#x", "߿[]x");
test("f&#x", "[]", "߿f&#x", "߿[]");
test("t; < lt &l", "[]", "< < lt <lt; < lt &lt", "< < lt <l[]t");
Tomas Langkaas
  • 3,795
  • 1
  • 14
  • 29
  • @sln, found a solution, switching back to the `search string | (entity)` pattern. – Tomas Langkaas Apr 29 '17 at 01:47
  • I posted one too. I'm going to have a look at yours though. Probably going to be later. Thanks. –  Apr 29 '17 at 22:23
  • @sln, great. I also made test cases you may use. – Tomas Langkaas Apr 29 '17 at 22:28
  • This is a really thorough list of test cases, and a great answer too. – nathancahill Apr 29 '17 at 23:40
  • @nathancahill, thanks, the question was more difficult than I first thought and requires quite a few tests to get right. – Tomas Langkaas Apr 30 '17 at 06:34
  • @TomasLangkaas - Ok, I've checked your latest answer. Some observations: In your comments, #3, #6, #8 are identical. This is because you changed your model to have the `search_for` first. `(searchFor)|(entity)`. Since it is first, you have capitalized on the fact that there are only 3 conditions: 1. The search string _without_ a partial at the end, 2 The search string _with_ a partial at the end. 3. The entity. To _guard_ against condition 2, you did right by adding assertions in that case. I was going to do that when I changed the model to `search_for` first, thought backtracking issues. –  May 01 '17 at 00:11
  • _Continued_ - Things I recommend changing: `& -> (?!#x[a-f\\d]+;|#\\d+;|[a-z]+;)` , ` -> (?!x[a-f\\d]+;|\\d+;)` , ` -> (?![a-f\\d]+;)` , `[a-z\d]+ => [a-f\d]+ -> (?![a-f\\d]*;)` Also, you might want to incorporate multiple searches in a single regex, like I did. It's not so hard and since you are using a string.replace via regex, the construction (replacement) is done at a low level without having to come out of solution ( ala _exec()_ ). Good job. –  May 01 '17 at 00:17
  • @sln, thanks. I started from scratch, so I also tested and discarded models where #3, #6 and #8 where not identical. Thanks for pointing out typos in the regexes where `[a-z\d]` was supposed to be `[a-f\d]` and recommending some changes, now updated. About combining the regexes into a single regex, I chose not to because I found it easier to read and maintain the code like this (including fixing the typos you pointed out). Feel free to make your own version which combines them. Thanks for all constructive comments and for transferring the bounty. – Tomas Langkaas May 01 '17 at 18:17
  • This is huge @TomasLangkaas! – nathancahill May 04 '17 at 01:38
1

One option here is to temporarily replace the string being searched for with a "dummy" string wherever it appears in an escaped character sequence prior to doing the actual replace. The "dummy" string will need to be something that is very unlikely to appear anywhere in the HTML. After the actual replace has been performed, a further replace can then be done to change the "dummy" string back to the string being searched for.

Below is a demo of this method in action, which produces the requested results. It uses this useful technique when a global replace is needed that doesn't use a regular expression and this useful technique for converting any string into a string literal for use in a regular expression (with any special characters escaped appropriately).

// Demo: run the four searches from the question against the escaped HTML.
// The entities are spelled out in full; with them decoded, the quoted search
// strings would not even parse.
var html = "&lt;img border=&#039;0&#039; /&gt;";
replaceInHtml(html, 'lt', 'replacement');
replaceInHtml(html, '39', 'replacement');
replaceInHtml(html, '&#039;', 'replacement');
replaceInHtml(html, 'border=&#039;', 'replacement');

/**
 * Replace `str` with `replacement` in `html`, protecting occurrences that sit
 * inside an "&...;" sequence, and log the result to the console.
 *
 * Works in three passes: mask `str` with a placeholder where it appears
 * between "&" and ";", replace the remaining occurrences, then turn the
 * placeholder back into `str`.
 *
 * @param {string} html        escaped HTML to process
 * @param {string} str         literal text to search for
 * @param {string} replacement text to insert
 */
function replaceInHtml(html, str, replacement) {
  // Placeholder that is unlikely to appear in the HTML
  var dummyStr = '!*&$^£"^';

  var guardPattern = new RegExp(
      '(&[#a-zA-Z0-9]*)' + escapeRegExp(str) + '([#a-zA-Z0-9]*;)', 'g');

  var result = html
      .replace(guardPattern, '$1' + dummyStr + '$2')  // mask inside entities
      .split(str).join(replacement)                   // real replacement
      .split(dummyStr).join(str);                     // unmask

  console.log([
    'Source:  ' + html,
    'Replace: ' + str,
    'With:    ' + replacement,
    'Gives:   ' + result
  ].join('\n'));
}

/**
 * Backslash-escape the regex metacharacters in `str` so it can be embedded
 * in a RegExp source and matched literally.
 *
 * @param {string} str literal text
 * @returns {string} regex-safe text
 */
function escapeRegExp(str) {
  var metaChars = /[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g;
  var escaped = str.replace(metaChars, "\\$&");
  return escaped;
}
Community
  • 1
  • 1
Steve Chambers
  • 31,993
  • 15
  • 129
  • 173
  • I think this fully strips the escape sequences before searching and replacing right? I still want them in the final string, just not partially matched. – nathancahill Apr 20 '17 at 23:01
  • Sorry, had misunderstood the question and thought you just wanted to find out whether a match existed in the HTML rather than doing a global search and replace. Have now modified my answer to do the search and replace using a slightly different technique. – Steve Chambers Apr 21 '17 at 15:36
1

I started by matching everything that is between & and ;:

// Sample input and the literal text to look for.
let str = "39<img border=39'0'&39; />39";
let search = '39';
// Matches a run that starts with '&', ends with ';', contains the search text,
// and has no other '&' or ';' in between. `search` is inserted as-is.
let regexp = new RegExp('&[^&;]*?(' + search + ')[^&;]*?;', 'g'); // /&[^&;]*?(SEARCH)[^&;]*?;/g
let match = str.match(regexp);

// Array of matched substrings, or null when nothing matched.
console.log(match);

Then by implication, I would like to match everything what's not between these two characters:

/**
 * Build the global, multiline regex used to find `searchStr` outside of
 * "&...;" sequences.
 *
 * Shape: /(?:^&[^&;]*?)?(SEARCH)(?!(?:[^&;]*?;))|(?:(?:^|;)(?:[^&;]*?)(SEARCH))/gm
 *   - group 1: SEARCH not followed by a ';' without an intervening '&' or ';'
 *   - group 2: SEARCH reached from a line start or a ';' without crossing '&' or ';'
 *
 * The search string is treated as literal text: regex metacharacters in it
 * are escaped before it is inserted into the pattern.
 *
 * @param {string} searchStr literal text to search for
 * @returns {RegExp} a fresh regex with the "gm" flags
 */
const prepareRegexp = searchStr => {
  // Escape metacharacters so input such as ".*+" or "(" is matched literally.
  const literal = searchStr.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  return new RegExp(
    '(?:^&[^&;]*?)?(' + literal + ')(?!(?:[^&;]*?;))' +
    '|(?:(?:^|;)(?:[^&;]*?)(' + literal + '))',
    'gm'
  );
};

/**
 * Collect the occurrences of `searchStr` in `str` reported by the regex from
 * prepareRegexp(), using an exec() loop.
 *
 * For each match: when the full match equals `searchStr` the full match is
 * recorded, otherwise the second capturing group is recorded (the second
 * alternative also consumes the text between the last ';' or line start and
 * the search string).
 *
 * @param {string} str       text to search in
 * @param {string} searchStr text to search for
 * @returns {Array|null} the recorded items, or null when there are none
 */
let find = (str, searchStr) => {
  const pattern = prepareRegexp(searchStr);
  const hits = [];

  // exec() advances pattern.lastIndex on every successful match.
  for (let m = pattern.exec(str); m !== null; m = pattern.exec(str)) {
    hits.push(m[0] === searchStr ? m[0] : m[2]);
  }

  if (hits.length) {
    return hits;
  }
  return null;
}

//Function 'find' refactored to avoid loop:
/**
 * Loop-free variant of find(): collects the captured occurrences of
 * `searchStr` in `str` by using String.prototype.replace purely as a match
 * iterator (its return value is discarded).
 *
 * Unlike the loop version this returns an empty array, not null, when nothing
 * is found.
 *
 * @param {string} str       text to search in
 * @param {string} searchStr text to search for
 * @returns {Array} captured group 1 when it is truthy, otherwise group 2, per match
 */
find = (str, searchStr) => {
  let allFoundItems = [];

  // The regex is built once here; the previous extra `regexp` local was never used.
  str.replace(prepareRegexp(searchStr), (match, p1, p2) => {
    if (p1) {
      allFoundItems.push(p1);
    } else {
      allFoundItems.push(p2);
    }
  });

  return allFoundItems;
}

//And function to replace the searched string:
/**
 * Replace the occurrences of `searchStr` found by prepareRegexp() with
 * `replaceWith`, keeping everything else the regex consumed.
 *
 * Both alternatives of the regex may consume text in front of the search
 * string (an optional "&..." prefix for group 1, the run after ';' or line
 * start for group 2), and in both the captured search text is the tail of the
 * match. The replacement therefore keeps the leading part of the match and
 * swaps only the captured tail.
 *
 * `replaceWith` is always inserted literally; "$&", "$1" and similar have no
 * special meaning.
 *
 * @param {string} str         text to process
 * @param {string} searchStr   text to search for
 * @param {string} replaceWith literal replacement text
 * @returns {string} the processed string
 */
const replace = (str, searchStr, replaceWith) =>
  str.replace(prepareRegexp(searchStr), (match, p1, p2) => {
    // Whichever group took part in the match sits at the end of `match`.
    const captured = p1 || p2 || '';
    const kept = match.slice(0, match.length - captured.length);
    return kept + replaceWith;
  });

// Demo script. Entities are spelled out in full; with them decoded, the quoted
// search strings below would be syntax errors.
let str = "39&lt;img border=39&#039;0&#039;&39; width: 50%; /&gt;39";
//console.log('Searching "39":', find(str, '39'));
console.log('Searching "&#039;":', find(str, '&#039;'));
//Search &lt;img border=&#039;0&#039; width: 50%; /&gt; for 50:
console.log('Searching "50":', find(str, '50'));

console.log('Replacing "39" with "|39|":', replace(str, '39', '|39|'));
console.log('Replacing "50" with "|50|":', replace(str, '50', '|50|'));

//Now test the string against given examples:
str = '&lt;img border=&#039;0&#039;';
//Search &lt;img border=&#039;0&#039; /&gt; for lt: No match.
console.log('Searching "lt":', find(str, 'lt'));
//Search &lt;img border=&#039;0&#039; /&gt; for 39: No match.
console.log('Searching "39":', find(str, '39'));
//Search &lt;img border=&#039;0&#039; /&gt; for &#039;: Match.
console.log('Searching "&#039;":', find(str, '&#039;'));
console.log('Replacing "&#039;" with "|&#039;|":', replace(str, '&#039;', '|&#039;|'));
//Search &lt;img border=&#039;0&#039; /&gt; for border=&#039;: Match.
console.log('Searching "border=&#039;":', find(str, 'border=&#039;'));
console.log('Replacing "border=&#039;" with "|border=&#039;|":', replace(str, 'border=&#039;', '|border=&#039;|'));
.as-console-wrapper {
  max-height: 100% !important;
}

And the breakdown of the regex: https://regex101.com/r/UCNnu1/2

//EDIT:

However that doesn't match the search string if it's followed by ;, so in order to capture such strings, we need to extend our regex to match another set of characters and use regexp.exec to catch only the interesting bits. The extended regex is:

https://regex101.com/r/UCNnu1/3

I updated the code to use the regexp for replace.

Oskar
  • 2,300
  • 19
  • 22
  • Yes, this looks exactly like what I need. Going to try to plug it in. – nathancahill Apr 20 '17 at 22:37
  • Note that if search contains metacharacters, you need to escape those. Try with `.*+` – Tushar Apr 22 '17 at 06:33
  • `Then by implication, I would like to match everything what's not between these two characters:` There is no analogy in regex. –  Apr 22 '17 at 23:49
  • Building off of the other point about no matches when the string contains a semicolon, this does not seem to be working: `console.log(find('<img style='height: 100%;' />', '100'))` – nathancahill Apr 24 '17 at 16:23
  • @nathancahill yes, you're right, I think I have an another way of replacing the value, however I can't find a single regex which could match what we search for. Here is the new regex: https://regex101.com/r/UCNnu1/3 and I'm going to update my answer if you agree that we want to just replace the string instead of matching values. – Oskar Apr 25 '17 at 13:42
  • @nathancahill Actually, I've found another way of matching strings, we can use https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec and grab strings only from capturing groups. My updated answer matches the searched strings even if they're in substring with `;` BTW. Can you find any other border or special cases to cover? – Oskar Apr 25 '17 at 14:14
  • Thanks @Oskar, I'll try to plug it in again. – nathancahill Apr 26 '17 at 19:09
1

RegEx can be used to check this, but it cannot cover all possible entities and is not the best tool for this job, whereas the method below will work for all HTML entities.

I'm trying to match and replace full escape sequences like &#039; but not partial, like 39, since 39 is not actually in the unescaped string.

Basically, you want to replace HTML entities with their unescaped form. This is what the function below does, so you don't need RegEx for that.

I'll use unescapeHTML function from this answer from Web_Designer

var escape = document.createElement('textarea');

/**
 * Unescape HTML entities in `html` by assigning it as the innerHTML of the
 * shared `escape` element (created above as a <textarea>) and reading the
 * element's textContent back.
 *
 * @param {string} html escaped HTML text
 * @returns {string} the element's textContent after the assignment
 */
function unescapeHTML(html) {
    var scratch = escape;
    scratch.innerHTML = html;
    var decoded = scratch.textContent;
    return decoded;
}

This first creates a new <textarea> element. Inside the function, the string passed as the argument is assigned as the innerHTML of this textarea, and its textContent is then returned. This is the trick used to unescape HTML entities.

We can reuse this to determine if the string is valid HTML entity or not. If the function is able to unescape it, then it is valid HTML entity else it is not. This is what you want to determine.

var escape = document.createElement('textarea');

/**
 * Unescape HTML entities in `html` by assigning it as the innerHTML of the
 * shared `escape` element (created above as a <textarea>) and reading the
 * element's textContent back.
 *
 * @param {string} html escaped HTML text
 * @returns {string} the element's textContent after the assignment
 */
function unescapeHTML(html) {
  var scratch = escape;
  scratch.innerHTML = html;
  var decoded = scratch.textContent;
  return decoded;
}

var str = '&lt;img border=&#039;0&#039; /&gt;';

console.log(unescapeHTML('lt') !== 'lt');
console.log(unescapeHTML('39') !== '39');
console.log(unescapeHTML('&#039;') !== '&#039;');
console.log(unescapeHTML('border=&#039;') !== 'border=&#039;');
Community
  • 1
  • 1
Tushar
  • 78,625
  • 15
  • 134
  • 154
  • 1
    `RegEx can be used to check this but it cannot cover all possible entities` In fact the [xml spec](http://www.w3.org/TR/1998/REC-xml-19980210#dt-entref) uses regex to define Character and Entity References. It's simply this `(?i)[%&](?:[a-z]+|(?:\#(?:[0-9]+|x[0-9a-f]+)));` –  Apr 23 '17 at 00:03
1

Is there a way to exclude matches between & and ; while still accepting sequences that include both of those characters? Essentially, each escape sequence should be treated like a single token.

In order to treat entities as separate tokens, we can build a regex that captures entities before any target substring, then use a callback function to return captured entities unmodified to the string.

An example, replacing "39" when it is not within an entity:

// Entities are captured first (named, hex "&#x..;" and decimal "&#..;") and
// returned unchanged; only a "39" outside of an entity reaches `replacement`.
str.replace(
  /(&[a-z]+;|&#x[0-9a-f]+;|&#\d+;)|39/gi,
  function(m, entity){
    return entity || replacement;
  }
);

I'm trying to match and replace full escape sequences like &#039; but not partial, like 39

When replacing entities, such as &#039;, a different approach is required. The following working demo handles this and also builds regexes dynamically from provided search strings, handling all OP test cases:

/**
 * Replace `searchFor` with `replacement` in the escaped HTML `str` without
 * matching inside HTML entities.
 *
 * - If `searchFor` starts with a complete entity (named, hex or decimal), a
 *   plain split/join is used, so the entity in the search text can match.
 * - Otherwise a regex of the form "(entity)|searchFor" is used: entities are
 *   consumed first and written back unchanged.
 *
 * Hex references ("&#x27;") are recognised as entities in both places.
 * Search strings that END in a partial entity are not specially handled here.
 *
 * @param {string} str         escaped HTML to process
 * @param {string} searchFor   literal text to search for
 * @param {string} replacement literal replacement text
 * @returns {string} the processed string
 */
function searchAndReplace(str, searchFor, replacement){
    return /^&([a-z]+|#x[\da-f]+|#\d+);/i.test(searchFor) ?
      // if searchFor equals or starts with an entity
      str.split(searchFor).join(replacement) :
      // else
      str.replace(
        new RegExp(
          '(&[a-z]+;|&#x[0-9a-f]+;|&#\\d+;)|' +
          searchFor.replace(/[^\w\s]/g, "\\$&"), //escape metachars
          'gi'
        ),
        function(m, entity){
          return entity || replacement;
        }
      );
}

// test cases

// Each case logs the result of searchAndReplace(str, searchFor, replacement)
// followed by a hand-written "match" / "no match" label for comparison.
console.log('Search for "border": \n' + searchAndReplace(
  '&lt;img border=&#039;0&#039; /&gt;', 
  'border', '{{border}}'
) + '\nmatch'); //matches

console.log('Search for "0": \n' + searchAndReplace(
  '&lt;img border=&#039;0&#039; /&gt;', 
  '0', '{{0}}'
) + '\nmatch'); //matches outside entities

console.log('Search for "&#039;": \n' + searchAndReplace(
  '&lt;img border=&#039;0&#039; /&gt;', 
  '&#039;', '{{&#039;}}'
) + '\nmatch'); //matches

console.log('Search for "39": \n' + searchAndReplace(
  '&lt;img border=&#039;0&#039; /&gt;', 
  '39', '{{39}}'
) + '\nno match'); //does not match

console.log('Search for "lt": \n' + searchAndReplace(
  '&lt;img border=&#039;0&#039; /&gt;', 
  'lt', '{{lt}}'
) + '\nno match'); //does not match

console.log('Search for "&lt;": \n' + searchAndReplace(
  '&lt;img border=&#039;0&#039; /&gt;', 
  '&lt;', '{{&lt;}}'
) + '\nmatch'); //matches

console.log('Search for "border=&#039;": \n' + searchAndReplace(
  '&lt;img border=&#039;0&#039; /&gt;', 
  'border=&#039;', '{{border=&#039;}}'
) + '\nmatch'); //matches

console.log('Search for "&lt;img": \n' + searchAndReplace(
  '&lt;img border=&#039;0&#039; /&gt;', 
  '&lt;img', '{{&lt;img}}'
) + '\nmatch'); //matches
Tomas Langkaas
  • 3,795
  • 1
  • 14
  • 29
  • The regex from string part is a really good idea, I was missing that. – nathancahill Apr 24 '17 at 16:05
  • Probably to cover all bases, the regex processes left to right, so what to find expression should be before entities expression `(string to find)|(entity)` –  Apr 25 '17 at 20:39
  • @sln, on the contrary, `(entity)|target string` is what meets OP's requirement of not matching target strings within entitites: Capturing entities before target strings is what prevents this. The only exception is when the target string is an entity, which is solved by specifically testing for this elsewhere. Thus, all bases are already covered. – Tomas Langkaas Apr 25 '17 at 21:17
  • @sln, I think you misunderstand the approach. The entity part of `(entity)|target string` is not for target strings that starts with entities, it captures all entities in general to prevent any match of target strings within an entity. I don't intend to fight anything, it is a pragmatic approach which solves the OP requirements with a few lines of code. – Tomas Langkaas Apr 25 '17 at 21:55
  • Found that the previous code did not handle the `<img` test case from [the answer from sln](http://stackoverflow.com/a/43553942/6738706), now included as a test case and code updated. – Tomas Langkaas Apr 25 '17 at 22:44
  • @TomasLangkaas - Ok, implemented the `((?=(entity)?)search)|(entity)` It works for all cases now. Primarily needed when the search string is a partial entity (from the beginning), i.e. `&` or `&g`, or `&amp`, etc... Thanks for pointing that out. –  Apr 26 '17 at 21:44
  • @sln, the solution I provided here still fails in some cases, see [my current answer](http://stackoverflow.com/a/43691002/6738706) instead. – Tomas Langkaas Apr 29 '17 at 13:06
0

I think you're referring to non-capturing groups: http://www.regular-expressions.info/brackets.html, which is lightly addressed in a few stack overflow posts (Why regular expression's "non-capturing" group is not working and Regular expression, "just group, don't capture", doesn't seem to work).

Namely, non-capturing groups don't get a group selector of their own (e.g. /a(?:[X-Z])([a-c])/g would match "aZb", but \1 would equal "b", not "Z").

Community
  • 1
  • 1
Paurian
  • 1,163
  • 8
  • 17
0

Is this what you are trying to do?

// Sample escaped HTML to experiment on.
var str = "&lt;img border-color=&#039;0&#039;"
console.log(str)
// Pattern: an optional "name=" prefix (letters and hyphens), then "&#",
// then lazily everything up to the next ";".
console.log(str.match(/((?:[a-z-]+=)?&#.+?;)/gi))
// Same pattern, wrapping each match in pipes.
console.log(str.replace(/((?:[a-z-]+=)?&#.+?;)/gi, "|$1|"))
Tezra
  • 7,096
  • 2
  • 19
  • 59
0

I think it would be possible if we were able to use lookbacks. Given that the flavor of regex is JavaScript, here, I don't think we can. This gets pretty close: [^&;]*(string)[^&;]*(?!9;|t;|;)

ferrants
  • 586
  • 4
  • 11
0

Final Version Candidate

4/29

This version should handle partial entity at the end of the search string
where the partial has pre-entity characters Like xxx&yyy or a&#00 etc..

This was the last case discovered by @TomasLangkaas.
Given all the other cases were covered, this is a final release candidate
for @nathancahill or anybody else interested.

(See comments and previous versions)

The model was changed from String.Replace() to while( match = Rx.exec() )

Explained here, but see the JS code for implementation.
It still uses the search string as the first alternation
with the entity as the second.

    (?=
         # This is the optional entity captured at
         # the same position where the search string starts.
         # If this entity matches, it means the search string
         # matches. Either one may be a partial of the other.

         # (1) The container for pre-entity / entity
         (                             
              # (2) Pre-entity characters 
              ( sLongest )                

              # Entity   
              (?:&(?:[a-z_:][a-zd_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-zd_:.-]*);
         )?                            
    )

    # (3) The search string ( consumes )
    ( sToFind )                        
 | 

    # (4) Or, the entity last  ( consumes ) 
    ( (?:&(?:[a-z_:][a-zd_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-zd_:.-]*); )

Be warned, you can't just break up entities syntax as parts of a regex.
It must be matched entirely, as a distinct item
(been down this road a hundred times, it can't be done..).

Note that this is a single pass, pure regex solution and is very fast.
If you take out all the comments, it's really only a few lines of code.
You can modify the entities sub-expression and use whatever you want.
The code structure won't need to change.

//=========================================================
// http://jsfiddle.net/b4b28a38/95/
//=========================================================

// ------------------------
// These are only used when pre-entity partials are detected
// RxEntPartial: a (possibly incomplete) entity or reference at the END of a string.
// RxEntFull:    a complete entity or reference, terminated by ';'.
// Name characters after the first may include digits, so the class is written
// as "\\d" in the string literal (a bare "\d" in a string is just "d", which
// would reject names such as "&frac12;").
var RxEntPartial = new RegExp( '(?:&(?:[a-z_:][a-z\\d_:.-]*|(?:#(?:[0-9]+|x[0-9a-f]*)?))?|%(?:[a-z_:][a-z\\d_:.-]*)?)$', 'ig' );
var RxEntFull = new RegExp( '(?:&(?:[a-z_:][a-z\\d_:.-]*|(?:#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-z\\d_:.-]*);', 'ig' );
// ------------------------

/**
 * Build the combined search regex for a list of literal search strings.
 *
 * Shape (flags "ig"):
 *   (?=( (pre-entity) entity )?)   groups 1 and 2, zero-width, optional
 *   ( search1|search2|... )        group 3, consumes
 *   | ( entity )                   group 4, consumes
 *
 * "pre-entity" is a lazy run of up to (longest search length - 1) characters,
 * so an entity that starts anywhere under the search match can be seen.
 *
 * The input array is not modified; escaping is done on a copy. Entity names
 * may contain digits after the first character.
 *
 * @param {string[]} FindAry literal search strings (put longer ones first when
 *                           one is a prefix of another)
 * @returns {RegExp} the combined regex
 */
function MakeRegex( FindAry ) {
   // Escape metachars into a new array and track the longest raw length
     var longest = 0;
     var aEscaped = [];
     for (var i = 0; i < FindAry.length; i++ )
     {
         if ( FindAry[i].length > longest )
            longest = FindAry[i].length;
         aEscaped.push( FindAry[i].replace(/(?!\s)\W/g, "\\$&") );
     }
   // Make 'longest' sub-expression
     longest -= 1;
     var sLongest = '';
     if ( longest > 0 )
         sLongest = '.{0,' + longest.toString() + '}?';
   // Join array using alternations
     var sToFind = aEscaped.join('|');
   // Entity sub-expression, shared by the lookahead and the second alternation
     var sEntity = '(?:&(?:[a-z_:][a-z\\d_:.-]*|(?:#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-z\\d_:.-]*);';
   // Return new regex object
     var rx = new RegExp( '(?=((' + sLongest + ')' + sEntity + ')?)(' + sToFind + ')|(' + sEntity + ')',
   'ig');
   //console.log( rx);
   return rx;
}


/**
 * Walk `str` with repeated Rx.exec() calls and build a new string in which
 * each accepted search match is wrapped in '[' and ']', while entities are
 * copied through unchanged.
 *
 * `Rx` is expected to have the group layout produced by MakeRegex():
 * (1) lookahead container, (2) pre-entity, (3) search string, (4) entity.
 * Rejected matches are skipped by advancing Rx.lastIndex past the entity.
 * Uses the file-level RxEntPartial and RxEntFull regexes for the
 * "partial entity at the end of the match" check.
 *
 * @param {string} str text to process
 * @param {RegExp} Rx  regex with the group layout described above
 * @returns {string} the rebuilt string
 */
function GetReplace( str, Rx )
{
   var sResult = '';    // New modified string to return
   var _M;              // Match object
   var ndxLast = 0;     // Previous Rx.lastIndex value when valid match
                        // ( where next match starts )

   Rx.lastIndex = 0;
   
   while ( _M = Rx.exec( str ) )
   {
       // Alternation 1: (1) = container (optional), p2 = pre-entity, entity, p3 = search string
       // Alternation 2: p4 = entity
       // Form:      
       //     (?=
       //          (                    # (1) start container
       //            ( pre-entity )            # (2)
       //            entity
       //          )?                       # (1) end
       //     )
       //     ( search )                 # (3)
       //  |  
       //     ( entity )                 # (4)
       
       if ( _M[4] )
       {
          // Entity, continue unchanged.
          sResult += str.substr( ndxLast , _M.index - ndxLast ) + _M[4];
          ndxLast = Rx.lastIndex;
          continue;
       }
       // Check if entity container captured inside zero length assertion matched 
       if ( _M[1] )
       {
           // Get some lengths 
      
           var L1 = _M[1].length;
           var L2 = _M[2].length;
           var L3 = _M[3].length;

           if ( L1 == L3 )
           {
              // Ok - This means it matched a trailing full entity
              // Intended, modify the search string
              sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
              ndxLast = Rx.lastIndex;
              continue;
           }

           // Pre entity check  ( pre-entity ) 
           if ( L2 > 0 )  
           {
               // This is a rare case and should not slow anything down.
               // End entity condition to check

               var sMatched = _M[3];
               var mpartial;
               RxEntPartial.lastIndex = 0;

               // Verify the match had a partial entity at the end
               if ( mpartial = RxEntPartial.exec( sMatched ) )
               {
                   // Check partial entity is not at the  beginning                   
                   if ( mpartial.index > 0 )
                   {
                       // Location in target string to check
                       // for a full entity.
                       var loc = _M.index + mpartial.index;

                       // Assure there is no full entity
                       RxEntFull.lastIndex = loc;
                       var mfull;
                       if ( mfull = RxEntFull.exec( str ) )
                       {
                           if ( mfull.index == loc )
                           {
                               // Not valid, move past it
                               RxEntFull.lastIndex = 0;
                               Rx.lastIndex += (L1 - L3);
                               continue;
                           }
                       }
                  }
               }
               // Ok - This definitely passes.
               // Intended, modify the search string
               sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
               ndxLast = Rx.lastIndex;
               continue;
           }

           // Normal checks
           // -------------------------------------------------------

           // If the length of the search >= the entity length
           // then the search includes an entity at the beginning
       

           if ( L3 >= L1 )
           {
              // Intended, modify the search string
              sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
              ndxLast = Rx.lastIndex;
              continue;
           }

          // Uh oh, the search is a partial entity (from the beginning).
          // Since we see that it is part of an entity, we have to go past it.
          // The match position reflects the partial entity.
          // Adjust (advance) the match position by the difference
          // to go past the entity.

          Rx.lastIndex += ( L1 - L3 );
          continue;
       }

       // Here, the search string is pure, just modify it
       sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
       ndxLast = Rx.lastIndex;
   }
   // Copy whatever follows the last accepted match.
   sResult += str.substr( ndxLast , str.length - ndxLast );
   return sResult;
}

// Driver: sample target string mixing complete entities, partial entities and plain text.
var TargetStr = "39&lt;img border=39&#039;&gt&gt&&#039 t; xx&gt x&gt; 0&# r&#039;&39; cad&#092; r&#FFd0&#22 /&gtttttt;  39; end";
console.log( 'Target:\r\n' + TargetStr );

// Always put the longest first of/if alphabetically sorted (ie. aaa|aa|a, etc ..)

// Build one regex for all search strings, then run the replacement pass once.
var rx = MakeRegex( ['39', '&lt;img', 'cad&#092;', '&gt', 't;', 'x&', '0&#', 'r&#'] );

var NewString = GetReplace( TargetStr, rx );

console.log('Find any of:  39, &lt;img, cad&#092;, &gt, t;, x&, 0&#, r&#' );
console.log( NewString );

Output

 Target:
 39&lt;img border=39&#039;&gt&gt&&#039 t; xx&gt x&gt; 0&# r&#039;&39; cad&#092; r&#FFd0&#22 /&gtttttt;  39; end

 Find any of:  39, &lt;img, cad&#092;, &gt, t;, x&, 0&#, r&#

 [39][&lt;img] border=[39]&#039;[&gt][&gt]&&#0[39] [t;] x[x&]gt x&gt; [0&#] r&#039;&[39]; [cad&#092;] [r&#]FFd[0&#]22 /&gtttttt;  [39]; end
  • Thanks @sln, this is very insightful. Also, thanks for your comments pointing out flaws in other answers. Lots of things that I hadn't considered. – nathancahill Apr 24 '17 at 19:23
  • However, this doesn't work if the string we're trying to match includes a full escape sequence, like `<img` instead of `39` – nathancahill Apr 24 '17 at 19:28
  • @sln, your approach currently fails on searching for `"&g"` or `"t;"`, I recommend capturing entities before target strings, as demonstrated and discussed in the comments of [my answer](http://stackoverflow.com/a/43577524/6738706) – Tomas Langkaas Apr 25 '17 at 21:40
  • @sln, try changing the line `Rx = MakeRegex( ['39'] );` to `Rx = MakeRegex( ['&g'] );` – Tomas Langkaas Apr 25 '17 at 21:45
  • @sln, try `Rx = MakeRegex( ['&g', 't;'] );` – Tomas Langkaas Apr 25 '17 at 21:59
  • @TomasLangkaas - So, you're saying with the search string, anything short of a possible entity, should be examined. –  Apr 25 '17 at 22:18
  • @TomasLangkaas - Yep, so if the search string is a entity beginning and only a partial, it would be matched first, and that wouldn't be correct. However if the entity expression is first in the regex, then it wouldn't match an extended entity search like `<img`. –  Apr 25 '17 at 22:46
  • @TomasLangkaas - It might be solved by putting a entity lookahead that captures the entity just before the search literal. `((?=(entity)?)search)|(entity)` then checking if the length of group 1 > group 2. If so, have to return group 2 unchanged, then advance the match position by the difference, or match the string difference. But, doable. Another possibility, is to let the caller specify the order, eh ? –  Apr 25 '17 at 22:55
  • _Final answer !_ –  Apr 26 '17 at 00:34
  • @sln Your solution matches string if it's full substring between `&` and `;`. It's visible in your example for `39`: `&39;` -> `&[39];` – Oskar Apr 26 '17 at 06:42
  • @Oskar - According to the specs, _Entity References_ defines legal characters [EntityRef ::= '&' Name ';'](http://www.w3.org/TR/1998/REC-xml-19980210#sec-physical-struct). If you follow the definition of _Name_ [Name ::= (Letter | '_' | ':') (Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender)*](http://www.w3.org/TR/1998/REC-xml-19980210#NT-Name) you see that the first letter must be a _base_ character [Letter ::= BaseChar | Ideographic](http://www.w3.org/TR/1998/REC-xml-19980210#NT-Letter). There is no digit allowed for the first letter. Thus, `&39;` is not legal.. –  Apr 26 '17 at 11:35
  • In the _Final answer_, I implemented the `((?=(entity)?)search)|(entity)` single pass approach It works for all cases now. Last update just uses a better regex for entities, from the xml specs. If interested, the ascii version `(?i)(?:&(?:[a-z_:][a-z\d_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-z\d_:.-]*);`. The equivalent Unicode version is `(?i)(?:&(?:[\p{Ll}\p{Lu}\p{Lo}\p{Lt}\p{Nl}_:][\p{Ll}\p{Lu}\p{Lo}\p{Lt}\p{Nl}\p{Mc}\p{Me}\p{Mn}\p{Lm}\p{Nd}_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[\p{Ll}\p{Lu}\p{Lo}\p{Lt}\p{Nl}_:][\p{Ll}\p{Lu}\p{Lo}\p{Lt}\p{Nl}\p{Mc}\p{Me}\p{Mn}\p{Lm}\p{Nd}_:.-]);` –  Apr 26 '17 at 21:48
  • Super interesting. This works really well, and benchmarks the fastest of all the answers here. Thanks for your help, and your comments on the other answers too. – nathancahill Apr 27 '17 at 03:57
  • @TomasLangkaas - Interesting. I'll try to do a fix for that case. Have a few ideas. Busy this morning, be back later. –  Apr 27 '17 at 12:41
  • After further checking, `Rx.lastIndex` only works during an `exec` or `test` function call. So, my current solution can't change it, and it works by coincidence. Giving back the +200. This could be done the way it is but the replacement would have to be done by building up a new string via 'exec()`. Found this out while testing a solution for @TomasLangkaas. Anyway, good luck, keep trying. –  Apr 28 '17 at 00:05
  • Do you have a test case that your solution fails for? – nathancahill Apr 28 '17 at 18:12
  • @nathancahill - I'm just updating my solution with a Regex.exec() version, so that the `Regex.lastIndex` is valid. On the previous version though, yeah add `&gttttt;` to the target string and you will see it. I have one more version to do after this latest _exec()_ one. I believe it's the final case to fix. See @TomasLangkaas comment re: `Rx = MakeRegex( ['39&', '0'] );`. Funny thing is I had this solution already done before I discovered the defective `Rx.lastIndex` as it pertains to string.replace(). So, I just have to convert it and post it later. –  Apr 28 '17 at 18:50
  • @nathancahill - I posted the final version that handles `39&`, `0` cases. It's a modified version of the previous one. –  Apr 29 '17 at 22:20
  • @sln, nice work. You may consider to test it against the full set of test cases I made. With `.exec()` comes more complex code and less use of regex features. For a fast and pure regex approach, see my latest answer. – Tomas Langkaas Apr 29 '17 at 23:02
  • @TomasLangkaas - I'm looking at your last answer now. Pretty good! Fwiw- I didn't want to post extra code for test cases in my answer since its already bloated with comments. But, here is a JSfiddle of it http://jsfiddle.net/gbu45rkf/1/. –  Apr 30 '17 at 23:55
0

UPDATE 2017-04-28
adding 39& and 0&# test case - no code change needed - wow, that was lucky :)

Important to note that I'm deliberately not allowing an ampersand to exist in the text to be searched except as the start of an escape sequence, whether it's a valid escape sequence or not, i.e. I'm allowing &39; to be an escape sequence even though it is technically invalid. This makes it easy to say that if we are asked to find a string with an ampersand which is not part of a complete escape sequence (e.g. `0&#`), then it should not be matched and is an invalid search string. The use of `AmpExcape` below to do this, rather than terminating and returning the unchanged string, is a convenience, since other use-cases of this regex (i.e. outside JavaScript) do not allow me conditional branching statements (or function callbacks on matches, for that matter). For my purpose, this comes closest to the definition I'm working against for escaped HTML.

UPDATE 2017-04-26
adding &g test case and editing answer/code for that case by escaping the find string

UPDATE 2017-04-25

A two pass regex solution:

/**
 * Two-pass, regex-only find/replace over escaped HTML in which every
 * entity-like token (anything matching `&\w*;` or `&#[0-9a-fA-F]*;`) is
 * treated as a single, indivisible unit.
 *
 * Pass 1 matches either the search term or an entity-like token and tags the
 * match with `&&` markers. Pass 2 collapses the tagged spans: where the term
 * matched, only the replacement is kept; where an entity matched, the entity
 * is left untouched.
 *
 * Assumes `test_str` does not itself contain `&&`, which is used as the marker.
 *
 * @param {string} test_str    Escaped HTML to search.
 * @param {string} find_str    Literal text to find (case-insensitive). A bare
 *                             `&` that does not begin a complete entity is
 *                             doubled by AmpExcape so it can never match.
 * @param {string} replace_str Literal replacement text; `$` is not special.
 * @returns {string} `test_str` with each whole-token occurrence replaced.
 */
function do_replace(test_str, find_str, replace_str) {
        const escaped_find_str = RegExcape(AmpExcape(find_str));
        const escaped_replace_str = RegExcape(replace_str);

        // `$` is special inside a replacement *string* (`$1`, `$&`, `$$`, ...).
        // Double it so pass 1 inserts replace_str verbatim; otherwise pass 2,
        // which looks for the literal replace_str, would not find it and the
        // `&&` markers would leak into the result.
        const literal_replace_str = replace_str.replace(/\$/g, '$$$$');

        // Group 1: the search term. Group 2: an entity-like token.
        const first_regex = new RegExp('(' + escaped_find_str + ')|(&\\w*;|&#[0-9a-fA-F]*;)', 'gi');
        const first_pass = test_str.replace(first_regex, '&&$1' + literal_replace_str + '&&$2');

        // `&&<replace>&&` (entity hit)       -> '' because group 2 is unset.
        // `&&<find><replace>&&` (term hit)   -> group 2, i.e. the replacement.
        const second_regex = new RegExp('&&(?:' + escaped_replace_str + '&&|(' + escaped_find_str + ')?(' + escaped_replace_str + ')?&&)', 'gi');
        return first_pass.replace(second_regex, '$2');
}

I was looking for an approach that used regex only and whilst the solutions by @sln and @tomas-langkaas were useful (well, mind blowing), I still wanted a method that I could use in JS and beyond.

I have something thats working for my test cases but it is forced to do multiple passes over the text. Originally I had three passes but I think I have it working at 2 passes. Obviously not as efficient as the other answers.

I've attempted to match <target string> | <entity> as those answers did, and flipped the order to be entity first as @sln did in the latest update. But I'm using && as a 'new' escape sequence. The strategy is:

  • Initially we escape the find and replacement strings for their use in regex. For the <target string>, if there is an & that is not part of a complete entity, then we replace it with a double && with the aim of preventing it from ever being matched, i.e. I'm considering the search for a partial entity to never be possible in escaped HTML.

  • the first pass replaces any matches (of either entities or target strings) with

    &&<target group value><replacement string>&&<entity group value>
    
  • on a match of the target string the <entity group value> will be blank, and we will return the && escape, the target string, the replacement string, and a final && escape

  • on a match of an entity, the <target group value> will now be empty, so we end up returning &&<replacement str>&& followed by the entity value.
  • the second pass can look for all occurrences of &&<target string><replacement string>&& and replace with <replacement string>
  • also in the second pass we can look for &&<replacement string>&& and know it is supposed to be replaced with blank.
  • we dont have to do anything for entity matches this time as we have left them untouched in pass 1

Here's the complete code with test cases (attribution for RegExcape maintained from @tomas-langkaas):

// helper for building regexes from strings
// http://stackoverflow.com/a/3561711/6738706
/**
 * Double every `&` that does not start a complete `&...;` sequence, so the
 * result can never match inside escaped HTML; complete sequences are kept.
 *
 * @param {string} str Search term.
 * @returns {string} `str` with each such bare `&` replaced by `&&`.
 */
function AmpExcape(str) {
        // Group 1: remainder of a complete sequence. Group 2: a bare '&'.
        const ampersand = /&(\w*;|#[0-9a-fA-F]*;)|(&)/g;
        return str.replace(ampersand, (whole, entityTail, bareAmp) => {
                return '&' + (bareAmp || '') + (entityTail || '');
        });
}
/**
 * Backslash-escape regex metacharacters so `str` can be embedded in a
 * RegExp source and matched literally.
 *
 * @param {string} str Text to escape.
 * @returns {string} `str` with each metacharacter prefixed by a backslash.
 */
function RegExcape(str) {
        const specials = /[-\/\\^$*+?.()|[\]{}]/g;
        return str.replace(specials, (ch) => '\\' + ch);
}
/**
 * Two-pass, regex-only find/replace over escaped HTML in which every
 * entity-like token (anything matching `&\w*;` or `&#[0-9a-fA-F]*;`) is
 * treated as a single, indivisible unit.
 *
 * Pass 1 matches either the search term or an entity-like token and tags the
 * match with `&&` markers. Pass 2 collapses the tagged spans: where the term
 * matched, only the replacement is kept; where an entity matched, the entity
 * is left untouched.
 *
 * Assumes `test_str` does not itself contain `&&`, which is used as the marker.
 *
 * @param {string} test_str    Escaped HTML to search.
 * @param {string} find_str    Literal text to find (case-insensitive). A bare
 *                             `&` that does not begin a complete entity is
 *                             doubled by AmpExcape so it can never match.
 * @param {string} replace_str Literal replacement text; `$` is not special.
 * @returns {string} `test_str` with each whole-token occurrence replaced.
 */
function do_replace(test_str, find_str, replace_str) {
        const escaped_find_str = RegExcape(AmpExcape(find_str));
        const escaped_replace_str = RegExcape(replace_str);

        // `$` is special inside a replacement *string* (`$1`, `$&`, `$$`, ...).
        // Double it so pass 1 inserts replace_str verbatim; otherwise pass 2,
        // which looks for the literal replace_str, would not find it and the
        // `&&` markers would leak into the result.
        const literal_replace_str = replace_str.replace(/\$/g, '$$$$');

        // Group 1: the search term. Group 2: an entity-like token.
        const first_regex = new RegExp('(' + escaped_find_str + ')|(&\\w*;|&#[0-9a-fA-F]*;)', 'gi');
        const first_pass = test_str.replace(first_regex, '&&$1' + literal_replace_str + '&&$2');

        // `&&<replace>&&` (entity hit)       -> '' because group 2 is unset.
        // `&&<find><replace>&&` (term hit)   -> group 2, i.e. the replacement.
        const second_regex = new RegExp('&&(?:' + escaped_replace_str + '&&|(' + escaped_find_str + ')?(' + escaped_replace_str + ')?&&)', 'gi');
        return first_pass.replace(second_regex, '$2');
}
// Test driver: `str` and `test_list` are module-level variables that are
// reassigned before each run_test() call.

// Case 1: text containing semicolons that are not entity terminators
// ("39;", "bor9;wder"). The last two terms contain an '&' that does not
// begin a complete entity.
let str = '39&lt;img style=&#039;width: 39;&#039; bor9;wder=39&#039;0&#039;&39; /&gt;39;';
let test_list = ['39','39;','9;','9;w','39&','0&#'];
run_test(str,test_list);

// Case 2: the four searches listed in the question.
// NOTE(review): the text ends in "/$gt;" rather than "/&gt;" - looks like a
// typo in the fixture, but none of the search terms touch it.
str = '&lt;img border=&#039;0&#039; /$gt;';
test_list = ['lt','39','&#039;','border=&#039;'];
run_test(str,test_list);

// Case 3: plain text without entities; the term occurs three times.
str = 'test string ring ring';
test_list = ['ring'];
run_test(str,test_list);

// Case 4: plain terms, whole entities, terms spanning an entity, and partial
// entities ("&g", "t;") against one text.
str = '39&lt;img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39;';
test_list = ['border','0','&#039;','39','lt','&lt;','border=&#039;','&lt;img','&g','t;'];
run_test(str,test_list);

/**
 * Log `base_str`, then one line per search term showing `base_str` with that
 * term replaced by `|term|` via do_replace(). Labels are padded to a common
 * width so the ': ' separators line up.
 *
 * @param {string}   base_str  Text to search.
 * @param {string[]} find_list Search terms to try, one output line each.
 */
function run_test(base_str, find_list) {
        const orig_label = 'original';

        // Width of the longest label, including the 'original' header.
        const max_len = Math.max(...find_list.concat(orig_label).map(function (label) {
                return label.length;
        }));

        console.log();
        // Use the base_str argument, not a module-level variable of the caller.
        console.log(pad(orig_label, max_len) + ': ' + base_str);

        find_list.forEach(function (gstr) {
                console.log(pad(gstr, max_len) + ': ' + do_replace(base_str, gstr, '|' + gstr + '|'));
        });
}
/**
 * Right-pad `str` with spaces until it is at least `len` characters long.
 *
 * @param {string} str Text to pad.
 * @param {number} len Minimum length of the result.
 * @returns {string} `str`, followed by as many spaces as needed.
 */
function pad(str, len) {
        let padded = str;
        while (padded.length < len) {
                padded = padded + ' ';
        }
        return padded;
}

and the output

original: 39&lt;img style=&#039;width: 39;&#039; bor9;wder=39&#039;0&#039;&39; /&gt;39;
39      : |39|&lt;img style=&#039;width: |39|;&#039; bor9;wder=|39|&#039;0&#039;&39; /&gt;|39|;
39;     : 39&lt;img style=&#039;width: |39;|&#039; bor9;wder=39&#039;0&#039;&39; /&gt;|39;|
9;      : 39&lt;img style=&#039;width: 3|9;|&#039; bor|9;|wder=39&#039;0&#039;&39; /&gt;3|9;|
9;w     : 39&lt;img style=&#039;width: 39;&#039; bor|9;w|der=39&#039;0&#039;&39; /&gt;39;
39&     : 39&lt;img style=&#039;width: 39;&#039; bor9;wder=39&#039;0&#039;&39; /&gt;39;
0&#     : 39&lt;img style=&#039;width: 39;&#039; bor9;wder=39&#039;0&#039;&39; /&gt;39;

original     : &lt;img border=&#039;0&#039; /$gt;
lt           : &lt;img border=&#039;0&#039; /$gt;
39           : &lt;img border=&#039;0&#039; /$gt;
&#039;       : &lt;img border=|&#039;|0|&#039;| /$gt;
border=&#039;: &lt;img |border=&#039;|0&#039; /$gt;

original: test string ring ring
ring    : test st|ring| |ring| |ring|

original     : 39&lt;img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39;
border       : 39&lt;img style=&#039;width: 39;&#039; |border|=&#039;0&#039;&39; /&gt;39;
0            : 39&lt;img style=&#039;width: 39;&#039; border=&#039;|0|&#039;&39; /&gt;39;
&#039;       : 39&lt;img style=|&#039;|width: 39;|&#039;| border=|&#039;|0|&#039;|&39; /&gt;39;
39           : |39|&lt;img style=&#039;width: |39|;&#039; border=&#039;0&#039;&39; /&gt;|39|;
lt           : 39&lt;img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39;
&lt;         : 39|&lt;|img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39;
border=&#039;: 39&lt;img style=&#039;width: 39;&#039; |border=&#039;|0&#039;&39; /&gt;39;
&lt;img      : 39|&lt;img| style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39;
&g           : 39&lt;img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39;
t;           : 39&lt;img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39;

UPDATE 2017-04-24

I abandoned the approach below for a couple of reasons, but most importantly it will only find the first occurrence of the find string in a run of text that has no ampersands or semi-colons. For instance, the string test string ring ring will only be matched as test st|ring| ring ring if the match is for ring - this seems pretty useless for a find and replace. I'm updating the answer so it works for matching ring at least the first time, as I previously missed the line start as an allowed terminal character, but I don't consider this a valid solution for all possible texts.

ORIGINAL ANSWER (with modifications)

If you care about semi-colon appearing in the text other than as a terminal character to a corresponding &, like for inline styles where it might say style="width: 39;", then you need something a bit complicated:

'((?:^|(?!(?:[^&;]+' + str + ')))(?=(?:(?:&|;|^)[^&;]*))(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?)(' + str + ')'

The code presented further down will output the original test cases:

"original":      &lt;img border=&#039;0&#039; /$gt;
"lt":            &lt;img border=&#039;0&#039; /$gt;
"39":            &lt;img border=&#039;0&#039; /$gt;
"&#039;":        &lt;img border=|&#039;|0|&#039;| /$gt;
"border=&#039;": &lt;img |border=&#039;|0&#039; /$gt;

It also shows the specific output against semi-colons appearing in the text and search terms.

"original":      39&lt;img style=&#039;width: 39;&#039; bor;der=39&#039;0&#039;&39; /&gt;39;
test string that may be followed by semi-colon :
|39|&lt;img style=&#039;width: |39|;&#039; bor;der=|39|&#039;0&#039;&39; /&gt;|39|;
test match with semi-colon:
39&lt;img style=&#039;width: |39;|&#039; bor;der=39&#039;0&#039;&39; /&gt;|39;|
test match with semi-colon mid string
39&lt;img style=&#039;width: 39;&#039; |bor;der|=39&#039;0&#039;&39; /&gt;39;

NOTE And this is the example of where the approach falls apart:

"original":      test string ring ring
test st|ring| ring ring

There is no lookbehind that would allow a 'memory' effect to differentiate between zero characters after a previous match and zero characters, but within an escaped sequence. So it would be impossible to match the second occurrence of ring without also matching 39 in the string &#039;

Heres the example code:

/**
 * Build the global, case-insensitive regex for the single-regex approach.
 * Capture group 1 is the text leading up to the match; group 2 is the match.
 *
 * `str` is inserted into the pattern as-is (it is not regex-escaped).
 *
 * @param {string} str Search term.
 * @returns {RegExp} A new regex with the `g` and `i` flags.
 */
function make_regex(str) {
  // Negative lookahead (containing the search term) that prevents the match
  // from beginning on a non-terminal character.
  const startGuard = '(?:^|(?!(?:[^&;]+' + str + ')))';
  // Positive lookahead forcing the match to begin on '&', ';' or line start.
  const terminalAhead = '(?=(?:(?:&|;|^)[^&;]*))';
  // Optional lead-in that may not start on '&' but may start at the line
  // start or on a preceding ';'.
  const leadIn = '(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?';

  const source = '(' + startGuard + terminalAhead + leadIn + ')(' + str + ')';
  return new RegExp(source, 'gi');
}

/**
 * Replace matches of `find_str` in `test_str` using the regex from
 * make_regex(). The leading chunk (capture group 1) is always written back,
 * followed by `replace_str`.
 *
 * `replace_str` is used as a replacement pattern, so it may refer to the
 * matched term as `$2`.
 *
 * @param {string} test_str    Text to search.
 * @param {string} find_str    Search term handed to make_regex().
 * @param {string} replace_str Replacement pattern appended after `$1`.
 * @returns {string} The text with replacements applied.
 */
function do_replace(test_str, find_str, replace_str) {
        const pattern = make_regex(find_str);
        const replacement = '$1' + replace_str;
        return test_str.replace(pattern, replacement);
}

// Demo driver for the single-regex approach. Each section logs a blank line,
// the sample text, then the do_replace() result for each search term.

// Sample 1: text containing semicolons that are not entity terminators.
let str = '39&lt;img style=&#039;width: 39;&#039; bor;der=39&#039;0&#039;&39; /&gt;39;';
console.log();
console.log('"original":     ', str);
for (const [caption, find, replacement] of [
        ['test string that may be followed by semi-colon :', '39', '|$2|'],
        ['test match with semi-colon:', '39;', '|39;|'],
        ['test match with semi-colon mid string', 'bor;der', '|bor;der|'],
]) {
        console.log(caption);
        console.log(do_replace(str, find, replacement));
}

// Sample 2: the four searches listed in the question.
str = '&lt;img border=&#039;0&#039; /$gt;';
console.log();
console.log('"original":     ', str);
for (const [label, find] of [
        ['"lt":           ', 'lt'],
        ['"39":           ', '39'],
        ['"&#039;":       ', '&#039;'],
        ['"border=&#039;":', 'border=&#039;'],
]) {
        console.log(label, do_replace(str, find, '|' + find + '|'));
}

// Sample 3: plain text in which the term occurs more than once.
str = 'test string ring ring';
console.log();
console.log('"original":     ', str);
console.log(do_replace(str, 'ring', '|$2|'));

Its important to note that the regex captures not only the text you want but the chunk of text before this. This affects your use of $1 as a replacement value since the text you want is now $2

The explanation of the terms is not straightforward but it can be broken down into:

A negative lookahead including string to find that prevents beginning the matching on non-terminal characters

(?:^|(?!(?:[^&;]+' + str + ')))

A positive lookahead that forces the matching to begin on a terminal character, or the start of the line

(?=(?:(?:&|;|^)[^&;]*))

A negative lookahead that prevents the matching from starting on an & but allows the line start or a previous semi-colon

(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?

And then finally the string to match

The effect is that we match the whole section of the text that contains the text we want, from the previous terminal value, to the end of the string to match. Because of this, javascript ends up with matches that are the full chunk that was matched. For instance, when we ask for border=&#039; we will end up with the chunk ;img border=&#039;. So we define two capture groups, one for the part of the chunk we are not interested in and one for our match. This allows us to use $1$2 to recreate the string or $1whatever to replace only the part we want. We can then use str.replace() with this strategy

spacepickle
  • 2,429
  • 12
  • 21
  • 1
    This won't match anything with an ampersand in front of it. `& asdffffasdfasdfasdfasad l 39 ` and only matches if there is a semi-colon behind it without an intervening ampersand. And, it totally disregards the structural validity of Character and Entity References . –  Apr 23 '17 at 20:23
  • @sln Not saying my answer cant be improved but... Im actually basing my answer on the structural validity of escaped HTML: that is, any opening ampersand must have a closing semi-colon following it. Your example is not valid escaped HTML so not relevant. Any subtring that you took that was of the form you presented would have needed to truncate before a semi-colon further on in the text. In that circumstance, not matching would be the correct behavior. Did I misunderstand or can you think of a valid escaped HTML substring where my assumption is not true? – spacepickle Apr 24 '17 at 03:22
  • Maybe I'm missing something with the two capture groups, but I'm not able to get this to work with simple strings like `do_replace('test string', 'ring', '$1|$2|')` – nathancahill Apr 24 '17 at 16:00
  • @nathancahill - yep thats not workinh - i forgot to allow the start of line and I notcied as well Ive got the greedy macth for non-terminal chacrters - ill update my answer – spacepickle Apr 24 '17 at 16:36
  • @nathancahill ill post another answer with a different approach – spacepickle Apr 24 '17 at 17:46
  • Thanks @spacepickle, going to try it out. – nathancahill Apr 26 '17 at 19:11