Python Write Json file from url, python 3 adding \n and b'

Question

I am upgrading from Python 2 to 3. This code works in Python 2 but not in 3. When I run it in Python 3, it does not seem to write the data in actual JSON format, and it adds \n and b' to the output.

I believe my Python 3 code is writing the JSON file incorrectly.

Code to extract json from web url:

def WebService_As_Source(Source_Id):
    """Fetch the web-service response for Source_Id and write it to a .json file.

    The URL is obtained from SOURCECONFIG.WebServices_URL(Source_Id) and is
    requested with HTTP basic authentication. The response is read line by
    line and written to a file whose path is the plain concatenation of
    SOURCECONFIG.GLOBAL_WorkPath, Source_Id and ".json".

    Args:
        Source_Id: identifier passed to SOURCECONFIG.WebServices_URL and used
            as the base name of the output file. It is concatenated with
            strings below, so presumably a str -- confirm against callers.

    Returns:
        None. Two timestamped progress messages are printed.

    NOTE(review): on Python 3 the response lines are bytes objects, and the
    write loop below passes them through str(), which stores their repr
    instead of the decoded text.
    """
    dst_path = SOURCECONFIG.GLOBAL_WorkPath
    # bdate is assigned here but never read in this function.
    bdate = SOURCECONFIG.GLOBAL_DATE

    print ("Extracting from Web Service...\t\t" + str(datetime.datetime.now()))

    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()


    uid = 'stack' # your LAN user id -- enter your own UserId here
    pwd = 'overflow'# your LAN password -- enter your own Password here
    top_level_url = SOURCECONFIG.WebServices_URL(Source_Id)
    # Register the credentials for top_level_url; the realm argument is None.
    password_mgr.add_password(None, top_level_url, uid, pwd)
    handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
    # create "opener" (OpenerDirector instance)
    opener = urllib.request.build_opener(handler)
    # Fetch the URL once through the opener; the returned response is discarded.
    opener.open(top_level_url)
    # From here on, all calls to urllib.request.urlopen use our opener.
    urllib.request.install_opener(opener)

    req = urllib.request.Request(top_level_url)
    response = urllib.request.urlopen(req)
    # Read the whole response body as a list of lines.
    data = response.readlines()

    # os.path.join receives one already-concatenated string, so no path
    # separator is inserted between dst_path and Source_Id.
    otf = open(os.path.join(dst_path + Source_Id+".json"), "w+")  # open text file

    # rowcount is incremented per line but never read afterwards.
    rowcount = 0
    for line in data:
        rowcount = rowcount + 1
        # NOTE(review): on Python 3 `line` is bytes, so str(line) produces the
        # literal text b'...' with escaped \n rather than the decoded line.
        otf.write(str(line))

    otf.close()
    print(Source_Id +" json extracted.\t\t"+ str(datetime.datetime.now()))

Sample of the actual JSON file that Python 3 produces:

b'[ {\n'b'  "filterFlag" : "",\n'b'  "lookup" : "",\n'b'  "rule" : "",\n'b'  "prefix" : "",\n'b'  "validBDRAppName" : "",\n'b'  "vendor" : {\n'b'    "bookId" : "40302539",\n'b'    "bookName" : "NYC",\n'b'    "bookStatus" : "ACTIVE",\n'b'    "commProductType" : "",\n'b'    "businessDate" : "2019-08-05",\n'b'    "endOfDay" : null,\n'b'    "excludeFromAggregation" : "FALSE",\n'b'    "geoLocation" : "",\n'b'    "isHoliday" : "",\n'b'    "isOSFIBook" : false,\n'b'    "legalEntity" : "",\n'b'    "location" : "",\n'b'    "logicalDate" : "",\n'b'    "regulatoryType" : "Trading",\n'b'    "reportingLineBookName" : "NYC",\n'b'    "reportingLinePathName" : "super/user",\n'b'    "riskFilterType" : "USA",\n'b'    "statusId" : "",\n'b'    "transit" : "",\n'b'    "l8n" : ""\n'b'  },\n'b'  "bdr" : {\n'b'    "bookId" : "7447",\n'b'    "bookName" : "NY",\n'b'    "bookTransit" : "92218",\n'b'    "bookStatus" : "ACTIVE",\n'b'    "owner" : "",\n'b'    "empId" : "",\n'b'    "purpose" : "Trading",\n'b'    "appName" : "STRATEGY",\n'b'    "appCode" : "STRATEGY",\n'b'    "transitDesc" : "TOR",\n'b'    "appCategory" : "Front Office",\n'b'    "bookAppId" : "49512",\n'b'    "bookAppName" : "NY",\n'b'    "deskName" : "USA",\n'b'    "product" : "",\n'b'    "asOfDate" : "2019-08-05",\n'b'    "legalEntity" : "CANADA",\n'b'    "bookAppSecondaryName" : "NY",\n'b'    "strategy" : "NY",\n'b'    "lhu" : "FCC3",\n'b'    "masterBookName" : "NY"\n'b'  }\n'b'}, {\n'b'  "filterFlag" : "",\n'b'  "lookup" : "",\n'b'  "rule" : "",\n'b'  "prefix" : "",\n'b'  "validBDRAppName" : "",\n'b'  "vendor" : {\n'b'    "bookId" : "40296540",\n'b'    "bookName" : "LDN",\n'b'    "bookStatus" : "ACTIVE",\n'b'    "commProductType" : "",\n'b'    "businessDate" : "2019-08-05",\n'b'    "endOfDay" : null,\n'b'    "excludeFromAggregation" : "FALSE",\n'b'    "geoLocation" : "",\n'b'    "isHoliday" : "",\n'b'    "isOSFIBook" : false,\n'b'    "legalEntity" : "",\n'b'    "location" : "",\n'b'    "logicalDate" : "",\n'b'    
"regulatoryType" : "Trading",\n'b'    "reportingLineBookName" : "LDN",\n'b'    "reportingLinePathName" : "stack/overflow",\n'b'    "riskFilterType" : "NONE",\n'b'    "statusId" : "",\n'b'    "transit" : "",\n'b'    "l8n" : ""\n'b'  },\n'b'

Sample of the JSON file that Python 2 produces:

I ran my code to grab the JSON from the URL in Python 2, and it gives me the data in actual JSON format and doesn't add the b''s and \n's.

[ {
  "filterFlag" : "",
  "lookup" : "",
  "rule" : "",
  "prefix" : "",
  "validBDRAppName" : "",
  "vendor" : {
    "bookId" : "40302539",
    "bookName" : "NYC",
    "bookStatus" : "ACTIVE",
    "commProductType" : "",
    "businessDate" : "2019-08-06",
    "endOfDay" : null,
    "excludeFromAggregation" : "FALSE",
    "geoLocation" : "",
    "isHoliday" : "",
    "isOSFIBook" : false,
    "legalEntity" : "",
    "location" : "",
    "logicalDate" : "",
    "regulatoryType" : "Trading",
    "reportingLineBookName" : "NYC",
    "reportingLinePathName" : "super/user",
    "riskFilterType" : "USA",
    "statusId" : "",
    "transit" : "",
    "l8n" : ""
  },
  "bdr" : {
    "bookId" : "7447",
    "bookName" : "NY",
    "bookTransit" : "92218",
    "bookStatus" : "ACTIVE",
    "owner" : "",
    "empId" : "",
    "purpose" : "Trading",
    "appName" : "STRATEGY",
    "appCode" : "STRATEGY",
    "transitDesc" : "TOR",
    "appCategory" : "Front Office",
    "bookAppId" : "49512",
    "bookAppName" : "NY",
    "deskName" : "USA",
    "product" : "",
    "asOfDate" : "2019-08-06",
    "legalEntity" : "CANADA",
    "bookAppSecondaryName" : "NY",
    "strategy" : "NY",
    "lhu" : "FCC3",
    "masterBookName" : "NY"
  }
}, {
  "filterFlag" : "",
  "lookup" : "",
  "rule" : "",
  "prefix" : "",
  "validBDRAppName" : "",
  "vendor" : {
    "bookId" : "40296540",
    "bookName" : "LDN",
    "bookStatus" : "ACTIVE",
    "commProductType" : "",
    "businessDate" : "2019-08-06",
    "endOfDay" : null,
    "excludeFromAggregation" : "FALSE",
    "geoLocation" : "",
    "isHoliday" : "",
    "isOSFIBook" : false,
    "legalEntity" : "",
    "location" : "",
    "logicalDate" : "",
    "regulatoryType" : "Trading",
    "reportingLineBookName" : "LDN",
    "reportingLinePathName" : "stack/overflow",
    "riskFilterType" : "NONE",
    "statusId" : "",
    "transit" : "",
    "l8n" : ""

Can anyone help with this?

Andras Deak · Accepted Answer · 2019-08-07T14:50:29.897

The problem is that response.readlines() (where response = urllib.request.urlopen(url)) returns a list of bytes. In python 2 bytes and str are the same thing, but on python 3 this is no longer true. So when you did

        otf.write(str(line))

the str() call was a no-op on python 2, but on python 3 you called str on a bytes object. This is never what you want to do:

>>> import urllib 
... resp = urllib.request.urlopen('https://stackoverflow.com') 
... dat = resp.readlines() 
... first_line = dat[0] 
... print(type(first_line)) 
... print(repr(first_line))
... print(repr(str(first_line)))
<class 'bytes'>
b'<!DOCTYPE html>\r\n'
"b'<!DOCTYPE html>\\r\\n'"

As you can see, the first line is a bytes object, and str(first_line) is a string that literally starts with a b and some single quotes.

Instead what you have to do is decode your bytes according to its corresponding encoding. I'm not very familiar with web things so I don't know what the best way is to correctly guess the encoding used by the website you're making requests to, but I do know that the third-party requests library can give you a usually correctly decoded json directly from the response.

If with urllib you have to do the decoding manually you need something like

        otf.write(line.decode('utf8'))

score 2 · Answer 2 · answered Aug 07 '19 at 14:43

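In Python 2 the default str type was a byte string, so bytes read from the network could be written out as str without any conversion. Python 3 separates bytes from str (Unicode text), which is what the b'' prefix is denoting.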

This line

data = response.readlines()

could be

# Decode with the charset declared in the Content-Type header (falling back
# to UTF-8 when none is given) and keep line endings, as readlines() does.
data = response.read().decode(response.headers.get_content_charset('utf-8')).splitlines(keepends=True)

which should figure out the proper encoding, as per this answer

Python Write Json file from url, python 3 adding \n and b'

2 Answers2