0

question:

I need to automatically extract all the name properties from this JavaScript (separately for providers large and providers small)

/*
    Simple OpenID Plugin
    http://code.google.com/p/openid-selector/

    This code is licensed under the New BSD License.
*/

var providers_large = {
    google : {
        name : 'Google',
        url : 'https://www.google.com/accounts/o8/id'
    },
    yahoo : {
        name : 'Yahoo',
        url : 'http://me.yahoo.com/'
    },
    aol : {
        name : 'AOL',
        label : 'Enter your AOL screenname.',
        url : 'http://openid.aol.com/{username}'
    },
    myopenid : {
        name : 'MyOpenID',
        label : 'Enter your MyOpenID username.',
        url : 'http://{username}.myopenid.com/'
    },
    openid : {
        name : 'OpenID',
        label : 'Enter your OpenID.',
        url : null
    }
};

var providers_small = {
    livejournal : {
        name : 'LiveJournal',
        label : 'Enter your Livejournal username.',
        url : 'http://{username}.livejournal.com/'
    },
    /* flickr: {
        name: 'Flickr',        
        label: 'Enter your Flickr username.',
        url: 'http://flickr.com/{username}/'
    }, */
    /* technorati: {
        name: 'Technorati',
        label: 'Enter your Technorati username.',
        url: 'http://technorati.com/people/technorati/{username}/'
    }, */
    wordpress : {
        name : 'Wordpress',
        label : 'Enter your Wordpress.com username.',
        url : 'http://{username}.wordpress.com/'
    },
    blogger : {
        name : 'Blogger',
        label : 'Your Blogger account',
        url : 'http://{username}.blogspot.com/'
    },
    verisign : {
        name : 'Verisign',
        label : 'Your Verisign username',
        url : 'http://{username}.pip.verisignlabs.com/'
    },
    /* vidoop: {
        name: 'Vidoop',
        label: 'Your Vidoop username',
        url: 'http://{username}.myvidoop.com/'
    }, */
    /* launchpad: {
        name: 'Launchpad',
        label: 'Your Launchpad username',
        url: 'https://launchpad.net/~{username}'
    }, */
    claimid : {
        name : 'ClaimID',
        label : 'Your ClaimID username',
        url : 'http://claimid.com/{username}'
    },
    clickpass : {
        name : 'ClickPass',
        label : 'Enter your ClickPass username',
        url : 'http://clickpass.com/public/{username}'
    },
    google_profile : {
        name : 'Google Profile',
        label : 'Enter your Google Profile username',
        url : 'http://www.google.com/profiles/{username}'
    }
};

openid.locale = 'en';
openid.sprite = 'en'; // reused in german& japan localization
openid.demo_text = 'In client demo mode. Normally would have submitted OpenID:';
openid.signin_text = 'Sign-In';
openid.image_title = 'log in with {provider}';

So I need to: A) Remove all the C-Style comments and B) Get all the name values for [providers_large, providers_small] (after the comments have been removed)

So far I have tried regex to remove C-Style comments (and failed) and regex to get everything between curly braces (and failed)

I subsequently tried to read it in as JSON, but this of course failed with an "Invalid JSON primitive" error.

These are the Stack Overflow pages I used, and these are the examples I have tried so far:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;


namespace ConsoleExperiments
{

    public class Program
    {

        // http://stackoverflow.com/questions/2538279/strip-out-c-style-multi-line-comments
        /// <summary>
        /// Removes every C-style block comment (<c>/* ... */</c>) from the input,
        /// including comments that span several lines.
        /// </summary>
        /// <param name="strInput">Source text to strip; must not be null.</param>
        /// <returns>The input with all block comments removed.</returns>
        /// <remarks>The stripped text is also echoed to the console, as before.</remarks>
        static string RemoveCstyleComments(string strInput)
        {
            // The lazy ".*?" stops at the first "*/". It only works across line breaks
            // with RegexOptions.Singleline, which lets '.' match '\n'. The previous
            // RegexOptions.Multiline merely changes the meaning of ^ and $, which is
            // why multi-line comments were never removed.
            // NOTE: a "/*" inside a quoted string literal is still treated as a comment start.
            const string strPattern = @"/\*.*?\*/";

            string strOutput = System.Text.RegularExpressions.Regex.Replace(strInput, strPattern, string.Empty, System.Text.RegularExpressions.RegexOptions.Singleline);
            Console.WriteLine(strOutput);
            return strOutput;
        }


        // http://stackoverflow.com/questions/413071/regex-to-get-string-between-curly-braces-i-want-whats-between-the-curly-brace
        // http://stackoverflow.com/questions/5337166/regular-expression-get-string-between-curly-braces
        // http://stackoverflow.com/questions/1904617/regex-for-removing-curly-brackets-with-nested-curly-brackets
        // http://stackoverflow.com/questions/378415/how-do-i-extract-a-string-of-text-that-lies-between-two-brackets-using-net
        /// <summary>
        /// Extracts the text between every outermost, balanced pair of curly braces.
        /// Nested braces are kept as part of the enclosing pair's content.
        /// </summary>
        /// <param name="strInput">Text to scan; must not be null.</param>
        /// <returns>
        /// The contents of all outermost brace pairs, joined by <see cref="Environment.NewLine"/>;
        /// an empty string when the input contains no balanced pair.
        /// </returns>
        /// <remarks>Each extracted content is also written to the console, prefixed with "Group: ".</remarks>
        static string GetCurlyValues(string strInput)
        {
            // The earlier version overwrote the pattern seven times and ended up matching
            // parentheses, not braces. Nested braces cannot be matched by a plain lazy or
            // negated-class pattern, so this uses .NET balancing groups:
            //   \{(?<depth>)    pushes one entry per opening brace,
            //   \}(?<-depth>)   pops one per closing brace (and fails when nothing is open),
            //   (?(depth)(?!))  rejects the match while any brace is still open.
            const string strPattern = @"\{((?>[^{}]+|\{(?<depth>)|\}(?<-depth>))*(?(depth)(?!)))\}";

            System.Text.RegularExpressions.Regex rex = new System.Text.RegularExpressions.Regex(strPattern);
            System.Collections.Generic.List<string> lsContents = new System.Collections.Generic.List<string>();

            // Walk all matches, not just the first one.
            foreach (System.Text.RegularExpressions.Match mMatch in rex.Matches(strInput))
            {
                // Group 1 is the text between the outermost braces of this match.
                string strContent = mMatch.Groups[1].Value;
                Console.WriteLine("Group: " + strContent);
                lsContents.Add(strContent);
            }

            return string.Join(Environment.NewLine, lsContents.ToArray());
        }


        /// <summary>
        /// Writes every line of "TestFile.txt" (relative to the current directory) to the
        /// console. Prints a notice when the file is missing, and a short error message
        /// when reading fails.
        /// </summary>
        static void ReadFile()
        {
            string strFilePath = @"TestFile.txt";

            try
            {
                // Guard clause: nothing to read when the file is not there.
                if (!System.IO.File.Exists(strFilePath))
                {
                    Console.WriteLine("File \"" + strFilePath + "\" does not exist.");
                    return;
                }

                // Disposing the reader at the end of the using block also closes it.
                using (System.IO.StreamReader reader = new System.IO.StreamReader(strFilePath))
                {
                    // ReadLine returns null once the end of the file is reached.
                    for (string current = reader.ReadLine(); current != null; current = reader.ReadLine())
                    {
                        Console.WriteLine(current);
                    }
                }
            }
            catch (Exception e)
            {
                // Report the failure instead of letting it propagate.
                Console.WriteLine("The file could not be read:");
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Sample provider record with three public string fields, each pre-filled
        /// with a placeholder value.
        /// </summary>
        public class cProvider
        {
            public string name;
            public string label;
            public string url;

            /// <summary>Fills the fields with their placeholder defaults.</summary>
            public cProvider()
            {
                name = "abc";
                label = "def";
                url = "url";
            }
        }


        /// <summary>
        /// Container holding a list of <see cref="cProvider"/> entries in its public
        /// field <c>foo</c>, which starts out empty.
        /// </summary>
        public class cProviders_large
        {
            public List<cProvider> foo;

            /// <summary>Creates the container with an empty provider list.</summary>
            public cProviders_large()
            {
                foo = new List<cProvider>();
            }
        }


        /// <summary>
        /// Experiment driver: echoes the JavaScript file to the console, then prints the
        /// JSON that JavaScriptSerializer produces for a sample provider container, and
        /// finally waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string strContent = System.IO.File.ReadAllText(@"D:\UserName\Downloads\openid-selector-1.3\openid-selector\js\openid-en - Kopie.js.txt");
            Console.WriteLine(strContent);

            System.Web.Script.Serialization.JavaScriptSerializer serializer = new System.Web.Script.Serialization.JavaScriptSerializer();

            // The same provider instance is deliberately added twice, so the
            // serialized list contains two identical entries.
            cProviders_large container = new cProviders_large();
            cProvider sharedProvider = new cProvider();
            container.foo.AddRange(new cProvider[] { sharedProvider, sharedProvider });

            Console.WriteLine(serializer.Serialize(container));

            Console.WriteLine(Environment.NewLine);
            Console.WriteLine(" --- Press any key to continue --- ");
            Console.ReadKey();
        }

    } // End Class Program


} // End namespace ConsoleExperiments

Could anybody who understands regex better than I do provide me with the necessary regular expressions? Right now, it looks like I will end up doing it by hand every time the file changes, and I really, really hate this...

Edit: On a sidenote, the v8 wrapper uses C++.NET, and thus doesn't work on Linux, although the v8 engine does work very well on Linux.

So I'm sticking to solving the problem via JSON conversion.

Stefan Steiger
  • 68,404
  • 63
  • 337
  • 408
  • 1
    I believe the problem is equivalent to this one: http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454 JavaScript is not a regular language. Removing the comments should be possible but will it be useful if you can't do the rest? – Stilgar Nov 30 '11 at 09:25
  • @Stilgar: Actually, doing the rest is less complicated than properly removing the comments. I am already 90 to 95 percent through with the rest. – Stefan Steiger Nov 30 '11 at 19:10

3 Answers3

4

You could use a javascript engine:

using System;
using System.IO;
using Noesis.Javascript;

// Runs the provider script in an embedded JavaScript engine (Noesis.Javascript)
// and prints the name and url of every entry in its providers_large object.
class Program
{
    static void Main()
    {
        var context = new JavascriptContext();
        // The script assigns properties on a global "openid" object (openid.locale, ...),
        // so an object with that name is registered before the script is run.
        context.SetParameter("openid", new object());
        // Execute the whole file; its "var providers_large = {...}" is read back below.
        context.Run(File.ReadAllText("test.js"));
        // dynamic: the engine's return type is not declared in this snippet, so members
        // are resolved at run time. NOTE(review): the loop below assumes each entry
        // exposes .Value with a string indexer — confirm against the Noesis API.
        dynamic providers_large = context.GetParameter("providers_large");
        foreach (var provider in providers_large)
        {
            // One line per provider; in the sample output an entry whose url is null
            // prints nothing after "url:".
            Console.WriteLine(
                "name: {0}, url: {1}", 
                provider.Value["name"], 
                provider.Value["url"]
            );
        }
    }
}

prints the following on my console:

name: Google, url: https://www.google.com/accounts/o8/id
name: Yahoo, url: http://me.yahoo.com/
name: AOL, url: http://openid.aol.com/{username}
name: MyOpenID, url: http://{username}.myopenid.com/
name: OpenID, url:
Darin Dimitrov
  • 960,118
  • 257
  • 3,196
  • 2,876
0

Consider the JavaScriptSerializer for this; it provides JSON deserialization. If you remove the vars and comments, it should be able to create an object graph.

Bas
  • 25,270
  • 7
  • 45
  • 82
0

Darin Dimitrov's answer is certainly the simplest.
However, Noesis.Javascript is most annoyingly written in C++.NET, which means it cannot be compiled on Linux, although both C#/.NET (via mono) and the v8 engine run excellently on Linux.

So here is the workaround via conversion to JSON and deserialization:

/// <summary>
/// Removes every C-style block comment (<c>/* ... */</c>) from the input, including
/// multi-line ones, while leaving quoted string literals untouched.
/// </summary>
/// <param name="strInput">Source text to strip; must not be null.</param>
/// <returns>The input with all block comments removed.</returns>
/// <remarks>The stripped text is also echoed to the console, as before.</remarks>
static string RemoveCstyleComments(string strInput)
        {
            // Two alternatives, tried left to right at each position:
            //  1. a single- or double-quoted string literal on one line (captured in group 1),
            //     so that a "/*" inside a string, e.g. a URL, is not mistaken for a comment;
            //  2. a block comment, using the atomic pattern from
            //     http://stackoverflow.com/questions/462843/improving-fixing-a-regex-for-c-style-block-comments
            //     Its negated character class already spans newlines, so no RegexOptions are needed.
            const string strPattern =
                  @"('(?:\\.|[^'\\\r\n])*'|""(?:\\.|[^""\\\r\n])*"")"
                + @"|/\*(?>(?:(?>[^*]+)|\*(?!/))*)\*/";

            // Keep string literals verbatim; replace comments with nothing.
            string strOutput = System.Text.RegularExpressions.Regex.Replace(
                  strInput
                , strPattern
                , delegate(System.Text.RegularExpressions.Match m) { return m.Groups[1].Success ? m.Value : string.Empty; }
            );

            Console.WriteLine(strOutput);
            return strOutput;
        } // End Function RemoveCstyleComments




        /// <summary>
        /// Rewrites the comment-free openid-selector provider script into one JSON object
        /// of the form <c>{ "providers_large" : {...}, "providers_small" : {...} }</c>.
        /// </summary>
        /// <param name="strInput">Script text with block comments already removed; must not be null.</param>
        /// <returns>The JSON text, wrapped in an outer pair of braces.</returns>
        /// <remarks>
        /// This is a text transformation tailored to that file layout, not a JavaScript
        /// parser. In particular, every apostrophe is turned into a double quote, so
        /// string values that themselves contain quote characters are not supported.
        /// </remarks>
        static string ReplaceVariables(string strInput)
        {
            System.Text.RegularExpressions.RegexOptions roMultiline = System.Text.RegularExpressions.RegexOptions.Multiline;

            // "var providers_large = {"  ==>  first member of the outer object.
            strInput = System.Text.RegularExpressions.Regex.Replace(strInput, @"var\s+providers_large\s*=\s*\{\s*", "\"providers_large\" : {" + Environment.NewLine);

            // "var providers_small = {"  ==>  second member, hence the leading comma.
            strInput = System.Text.RegularExpressions.Regex.Replace(strInput, @"\s*var\s+providers_small\s*=\s*\{\s*", ",   \"providers_small\" : {" + Environment.NewLine);

            // Statement terminators after the object literals ("};") are not valid inside JSON.
            strInput = System.Text.RegularExpressions.Regex.Replace(strInput, @"\}\s*;\s*", "}" + Environment.NewLine);

            // Quote provider keys at the start of a line ("google : {"). Anchoring on ^
            // instead of the previous $ keeps the preceding line break intact.
            strInput = System.Text.RegularExpressions.Regex.Replace(strInput, @"^[ \t]*(\w+)\s*:\s*\{", "\"$1\" : {", roMultiline);

            // Quote the known property keys. \b stops a match inside longer identifiers
            // such as "username", and the look-ahead also accepts a null value, so
            // "url : null" gets a quoted key too (it used to stay unquoted).
            strInput = System.Text.RegularExpressions.Regex.Replace(strInput, @"\b(name|url|label)\s*:\s*(?='|null\b)", "\"$1\" : ");

            // JSON strings use double quotes.
            strInput = strInput.Replace("'", "\"");

            // Drop everything from the first "openid.locale" assignment to the end of the
            // text; Singleline lets ".*" run across the remaining lines.
            strInput = System.Text.RegularExpressions.Regex.Replace(strInput, "openid\\.locale.*", "", System.Text.RegularExpressions.RegexOptions.Singleline);

            return "{" + strInput + "}";
        } // End Function ReplaceVariables


        /// <summary>
        /// Converts the provider script to JSON, deserializes it and collects the provider
        /// keys (e.g. "google", "yahoo") of the two provider groups.
        /// </summary>
        /// <param name="strInput">Raw content of the provider script; must not be null.</param>
        /// <returns>
        /// A case-insensitive collection with the provider keys stored under
        /// "providers_large" and "providers_small". A group that does not occur in the
        /// script has no entry, so <c>GetValues</c> returns null for it.
        /// </returns>
        /// <remarks>
        /// NOTE(review): the collection holds the object keys, not each provider's
        /// "name" property value — confirm this is what callers want.
        /// </remarks>
        static System.Collections.Specialized.NameValueCollection TrySerialize(string strInput)
        {
            strInput = RemoveCstyleComments(strInput);
            strInput = ReplaceVariables(strInput);

            System.Collections.Specialized.NameValueCollection nvc = new System.Collections.Specialized.NameValueCollection(StringComparer.OrdinalIgnoreCase);

            System.Web.Script.Serialization.JavaScriptSerializer js = new System.Web.Script.Serialization.JavaScriptSerializer();

            // A JSON object deserializes to a string-keyed dictionary; any other root
            // (array, primitive) yields null here and therefore an empty result.
            System.Collections.Generic.IDictionary<string, object> dictRoot =
                js.DeserializeObject(strInput) as System.Collections.Generic.IDictionary<string, object>;

            if (dictRoot == null)
            {
                return nvc;
            }

            string[] astrGroups = { "providers_large", "providers_small" };

            foreach (System.Collections.Generic.KeyValuePair<string, object> kvp in dictRoot)
            {
                // Resolve the group once per top-level entry instead of once per provider.
                string strGroup = null;
                foreach (string strCandidate in astrGroups)
                {
                    if (StringComparer.OrdinalIgnoreCase.Equals(kvp.Key, strCandidate))
                    {
                        strGroup = strCandidate;
                        break;
                    }
                } // Next strCandidate

                System.Collections.Generic.IDictionary<string, object> dictProviders =
                    kvp.Value as System.Collections.Generic.IDictionary<string, object>;

                // Skip unrelated top-level entries and groups that are not objects.
                if (strGroup == null || dictProviders == null)
                {
                    continue;
                }

                foreach (string strProviderKey in dictProviders.Keys)
                {
                    nvc.Add(strGroup, strProviderKey);
                } // Next strProviderKey

            } // Next kvp

            return nvc;
        } // End Function TrySerialize


        /// <summary>
        /// Reads the Russian openid-selector provider script, extracts the provider keys
        /// and prints them to the console, grouped into "providers large" and
        /// "providers small".
        /// </summary>
        public static void GetProviders()
        {
            // Only the openid-ru.js content was ever used: the earlier version read
            // openid-en.js first and overwrote the result immediately, which cost an
            // extra file read and could fail on a file that was not needed.
            string strContent = System.IO.File.ReadAllText(@"D:\UserName\Downloads\openid-selector-1.3\openid-selector\js\openid-ru.js");

            // System.Web.Extensions (JavaScriptSerializer) does the JSON work inside
            // TrySerialize.
            System.Collections.Specialized.NameValueCollection nvc = TrySerialize(strContent);

            PrintProviderGroup(nvc, "providers_large", "providers large: ");
            PrintProviderGroup(nvc, "providers_small", "providers small: ");
        } // End Sub GetProviders

        /// <summary>Prints a blank block, a heading and the indented values stored under one key.</summary>
        private static void PrintProviderGroup(System.Collections.Specialized.NameValueCollection nvc, string strKey, string strHeading)
        {
            Console.WriteLine(Environment.NewLine);
            Console.WriteLine(strHeading);

            // GetValues returns null when the key is absent; print only the heading then.
            string[] astrValues = nvc.GetValues(strKey);
            if (astrValues == null)
            {
                return;
            }

            foreach (string strValue in astrValues)
            {
                Console.WriteLine("    " + strValue);
            } // Next strValue
        } // End Sub PrintProviderGroup
Stefan Steiger
  • 68,404
  • 63
  • 337
  • 408