16

How to identify doc, docx, pdf, xls and xlsx based on file header in C#? I don't want to rely on either the file extension or MimeMapping.GetMimeMapping for this, as both can be manipulated.

I know how to read the header but don't know what combination of bytes indicates that a file is a doc, docx, pdf, xls or xlsx. Any thoughts?

Bablu Dutt
  • 299
  • 1
  • 5
  • 16
  • *I know how to read the header* - if you know that for all those formats, then you already able to distinguish between them. If not, then this is exactly how you do: read specification of each format, build something able to recognize each type individually, combine them into one solution. – Sinatr Mar 23 '15 at 14:23
  • Check out this post: http://stackoverflow.com/questions/58510/using-net-how-can-you-find-the-mime-type-of-a-file-based-on-the-file-signature, I will post the relevant section below in the Answer section – Alex Mar 23 '15 at 14:30
  • 4
    amazingly arrogant reply from Sinatr – lekso Aug 06 '15 at 13:55

4 Answers

11

This question contains an example of using the first bytes of a file to determine the file type: Using .NET, how can you find the mime type of a file based on the file signature not the extension

It is a very long post, so I am posting the relevant answer below:

public class MimeType
{
    // Leading-byte signatures ("magic numbers") that GetMimeType compares against the start of a file.
    // Values are written in decimal; each trailing comment shows the same bytes as ASCII text and/or hex.
    private static readonly byte[] BMP = { 66, 77 }; // "BM"
    // D0 CF 11 E0 A1 B1 1A E1. GetMimeType reports every file starting with these bytes as application/msword.
    private static readonly byte[] DOC = { 208, 207, 17, 224, 161, 177, 26, 225 };
    private static readonly byte[] EXE_DLL = { 77, 90 }; // "MZ"
    private static readonly byte[] GIF = { 71, 73, 70, 56 }; // "GIF8"
    private static readonly byte[] ICO = { 0, 0, 1, 0 }; // 00 00 01 00
    private static readonly byte[] JPG = { 255, 216, 255 }; // FF D8 FF
    private static readonly byte[] MP3 = { 255, 251, 48 }; // FF FB 30
    private static readonly byte[] OGG = { 79, 103, 103, 83, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0 }; // "OggS", then 00 02 and eight zero bytes
    private static readonly byte[] PDF = { 37, 80, 68, 70, 45, 49, 46 }; // "%PDF-1." (a header such as "%PDF-2" does not match)
    private static readonly byte[] PNG = { 137, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82 }; // 89 "PNG" 0D 0A 1A 0A, then 00 00 00 0D "IHDR"
    private static readonly byte[] RAR = { 82, 97, 114, 33, 26, 7, 0 }; // "Rar!" 1A 07 00
    private static readonly byte[] SWF = { 70, 87, 83 }; // "FWS"
    private static readonly byte[] TIFF = { 73, 73, 42, 0 }; // "II*" 00
    private static readonly byte[] TORRENT = { 100, 56, 58, 97, 110, 110, 111, 117, 110, 99, 101 }; // "d8:announce"
    private static readonly byte[] TTF = { 0, 1, 0, 0, 0 }; // 00 01 00 00 00
    private static readonly byte[] WAV_AVI = { 82, 73, 70, 70 }; // "RIFF"; GetMimeType picks WAV or AVI from the file extension
    // 30 26 B2 75 8E 66 CF 11 A6 D9 00 AA 00 62 CE 6C; GetMimeType picks WMV or WMA from the file extension.
    private static readonly byte[] WMV_WMA = { 48, 38, 178, 117, 142, 102, 207, 17, 166, 217, 0, 170, 0, 98, 206, 108 };
    private static readonly byte[] ZIP_DOCX = { 80, 75, 3, 4 }; // "PK" 03 04; GetMimeType reports DOCX only when the extension is .docx

    /// <summary>
    /// Determines a MIME type from the leading bytes (file signature) of <paramref name="file"/>.
    /// </summary>
    /// <remarks>
    /// The file name is used only as a tie-breaker for container signatures that are shared by
    /// several formats (OGG, RIFF, ASF, ZIP); a missing or blank name no longer disables the
    /// signature check itself. Signatures are tested in a fixed order and the first match wins.
    /// NOTE: every file that starts with the DOC signature is reported as "application/msword",
    /// and a ZIP container is reported as DOCX purely on the strength of its ".docx" extension;
    /// telling such formats apart reliably requires inspecting the container contents.
    /// </remarks>
    /// <param name="file">File content, or at least its leading bytes. May be null or empty.</param>
    /// <param name="fileName">Optional file name whose extension disambiguates shared signatures.</param>
    /// <returns>
    /// The detected MIME type, or "application/octet-stream" when no known signature matches.
    /// </returns>
    public static string GetMimeType(byte[] file, string fileName)
    {
        const string defaultMime = "application/octet-stream"; //DEFAULT UNKNOWN MIME TYPE

        // Nothing to inspect: report "unknown" instead of failing with a NullReferenceException.
        if (file == null || file.Length == 0)
        {
            return defaultMime;
        }

        // Normalise the extension with the invariant culture: a culture-sensitive ToUpper()
        // turns ".avi" into ".AVİ" under Turkish culture settings, which would never equal ".AVI".
        string extension = string.Empty;
        if (!string.IsNullOrWhiteSpace(fileName))
        {
            extension = (Path.GetExtension(fileName) ?? string.Empty).ToUpperInvariant();
        }

        if (HasSignature(file, BMP))
        {
            return "image/bmp";
        }

        if (HasSignature(file, DOC))
        {
            return "application/msword";
        }

        if (HasSignature(file, EXE_DLL))
        {
            return "application/x-msdownload"; //both use same mime type
        }

        if (HasSignature(file, GIF))
        {
            return "image/gif";
        }

        if (HasSignature(file, ICO))
        {
            return "image/x-icon";
        }

        if (HasSignature(file, JPG))
        {
            return "image/jpeg";
        }

        if (HasSignature(file, MP3))
        {
            return "audio/mpeg";
        }

        if (HasSignature(file, OGG))
        {
            if (extension == ".OGX")
            {
                return "application/ogg";
            }

            return extension == ".OGA" ? "audio/ogg" : "video/ogg";
        }

        if (HasSignature(file, PDF))
        {
            return "application/pdf";
        }

        if (HasSignature(file, PNG))
        {
            return "image/png";
        }

        if (HasSignature(file, RAR))
        {
            return "application/x-rar-compressed";
        }

        if (HasSignature(file, SWF))
        {
            return "application/x-shockwave-flash";
        }

        if (HasSignature(file, TIFF))
        {
            return "image/tiff";
        }

        if (HasSignature(file, TORRENT))
        {
            return "application/x-bittorrent";
        }

        if (HasSignature(file, TTF))
        {
            return "application/x-font-ttf";
        }

        if (HasSignature(file, WAV_AVI))
        {
            return extension == ".AVI" ? "video/x-msvideo" : "audio/x-wav";
        }

        if (HasSignature(file, WMV_WMA))
        {
            return extension == ".WMA" ? "audio/x-ms-wma" : "video/x-ms-wmv";
        }

        if (HasSignature(file, ZIP_DOCX))
        {
            return extension == ".DOCX"
                ? "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                : "application/x-zip-compressed";
        }

        return defaultMime;
    }

    /// <summary>
    /// Returns true when <paramref name="data"/> begins with every byte of <paramref name="signature"/>.
    /// </summary>
    private static bool HasSignature(byte[] data, byte[] signature)
    {
        // Data shorter than the signature can never match.
        if (data.Length < signature.Length)
        {
            return false;
        }

        for (int i = 0; i < signature.Length; i++)
        {
            if (data[i] != signature[i])
            {
                return false;
            }
        }

        return true;
    }
Community
  • 1
  • 1
Alex
  • 19,061
  • 10
  • 50
  • 66
  • 1
    Overall pretty good, but couldn't this be tricked by renaming a file (such as JavaFile.jar or ExcelFile.xlsx) to a different file extension (like JavaFile.docx or ExcelFile.docx)? Perhaps that's out of scope of this question, but the OP didn't want to rely on file extension (presumably at all). Is there any better detection method for the file types based on ZIP? – Tophandour Nov 17 '15 at 15:44
  • I will investigate, it requires a second step checking the files after decompression. – Alex Nov 17 '15 at 15:50
  • Actually, I just did some tests with some reputable and widely used pieces of software and it doesn't seem like checking a file in this much detail (referring to my previous comment) is very common. I suppose if you had to, you could open the file as if it were a zip and browse its directories and check directory names and filetypes inside of it to make sure that everything is what you expect. Then again, I'm sure a dedicated and knowledgeable person could still craft something to trick whatever solution you come up with. Eh, diminishing returns, I suppose. – Tophandour Nov 18 '15 at 21:05
  • as stated this does not answer the question, since using this with doc, xls or ppt files return the same mime type, also docx, xlsx and ppts return the same – Juan Zamudio May 28 '19 at 00:28
  • Hello! What is the byte sequence of the DOC type in Java? Thanks. – tom Jan 13 '20 at 12:04
8

Using file signatures alone is not really feasible (the new Office formats are ZIP files and the old Office files are OLE CF / OLE SS containers), but you can use C# code to read the containers and figure out what they are.

For newest Office formats, you can read the (DOCX/PPTX/XLSX/...) ZIP file using System.IO.Packaging : https://msdn.microsoft.com/en-us/library/ms568187(v=vs.110).aspx Doing that, you can find the ContentType of the first document part and infer using that.

For older Office files (Office 2003) you can use this library to distinguish them based on their contents (note that MSI and MSG files are also using this file format): http://sourceforge.net/projects/openmcdf/

E.g., here are the contents of an XLS file: XLS file internals

I hope this helps! :)

It would have certainly helped me, if I had found this answer earlier. ;)

user2173353
  • 3,852
  • 4
  • 39
  • 64
4

The answer from user2173353 is the most correct one, given that the OP specifically mentioned Office file formats. However, I didn't like the idea of adding an entire library (OpenMCDF) just to identify legacy Office formats, so I wrote my own routine for doing just this.

    /// <summary>
    /// Identifies the document type stored in a Compound File Binary (CFB) container by reading
    /// the class ID (GUID) of the first entry in the first directory sector and looking it up
    /// in <c>Formats</c>.
    /// </summary>
    /// <param name="fileData">
    /// Seekable stream holding the file from its first byte. The stream is left open, but its
    /// position is changed by this method.
    /// </param>
    /// <returns>
    /// The matching <see cref="CfbFileFormat"/>, or <see cref="CfbFileFormat.Unknown"/> when the
    /// data is not a CFB file, is truncated or malformed, or carries an unrecognised class ID.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileData"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="fileData"/> is not seekable.</exception>
    public static CfbFileFormat GetCfbFileFormat(Stream fileData)
    {
        if (fileData == null)
        {
            throw new ArgumentNullException(nameof(fileData));
        }

        if (!fileData.CanSeek)
        {
            throw new ArgumentException("Data stream must be seekable.", nameof(fileData));
        }

        try
        {
            // Notice that values in a CFB files are always little-endian. Fortunately BinaryReader.ReadUInt16/ReadUInt32 reads with little-endian.
            // If using .net < 4.5 this BinaryReader constructor is not available. Use a simpler one but remember to also remove the 'using' statement.
            // The final 'true' argument (leaveOpen) keeps the caller's stream open when the reader is disposed.
            using (BinaryReader reader = new BinaryReader(fileData, Encoding.Unicode, true))
            {
                // All offsets below are absolute, so the signature must be read from the start
                // of the stream too, not from wherever the caller left the position.
                fileData.Position = 0;

                // Check that data has the CFB file header
                var header = reader.ReadBytes(8);
                if (!header.SequenceEqual(new byte[] {0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1}))
                {
                    return CfbFileFormat.Unknown;
                }

                // Sector shift (2 byte uint) at offset 30 (0x1E) in the header: the sector size as a power of two.
                // The only valid values are 9 or 12, which gives 512 or 4096 byte sector size.
                fileData.Position = 30;
                ushort sectorShift = reader.ReadUInt16();
                if (sectorShift != 9 && sectorShift != 12)
                {
                    // Reject anything else; an unchecked '1 << n' would silently wrap the shift count.
                    return CfbFileFormat.Unknown;
                }

                long sectorSize = 1L << sectorShift;

                // Get first directory sector index at offset 48 in the header
                fileData.Position = 48;
                uint firstDirectorySector = reader.ReadUInt32();

                // File header is one sector wide. After that we can address the sector directly using the sector index
                long directoryAddress = sectorSize + (firstDirectorySector * sectorSize);

                // Object type field is offset 80 bytes into the directory sector. It is a 128 bit GUID, encoded as "DWORD, WORD, WORD, BYTE[8]".
                long classIdAddress = directoryAddress + 80;

                // A truncated or crafted file can point past the end of the data. Bail out before
                // seeking: some streams reject such positions, and a short read would hand the
                // Guid constructor fewer than 8 bytes.
                if (classIdAddress + 16 > fileData.Length)
                {
                    return CfbFileFormat.Unknown;
                }

                fileData.Position = classIdAddress;
                var bits127_96 = reader.ReadInt32();
                var bits95_80 = reader.ReadInt16();
                var bits79_64 = reader.ReadInt16();
                var bits63_0 = reader.ReadBytes(8);

                var guid = new Guid(bits127_96, bits95_80, bits79_64, bits63_0);

                // Compare to known file format GUIDs
                CfbFileFormat result;
                return Formats.TryGetValue(guid, out result) ? result : CfbFileFormat.Unknown;
            }
        }
        catch (IOException)
        {
            // Includes EndOfStreamException from reading past the end of a short file.
            return CfbFileFormat.Unknown;
        }
        catch (OverflowException)
        {
            return CfbFileFormat.Unknown;
        }
    }

    /// <summary>
    /// Document types that GetCfbFileFormat can report for a Compound File Binary (CFB) container.
    /// </summary>
    public enum CfbFileFormat
    {
        /// <summary>Legacy Word document.</summary>
        Doc = 0,

        /// <summary>Legacy Excel workbook.</summary>
        Xls = 1,

        /// <summary>Windows Installer package.</summary>
        Msi = 2,

        /// <summary>Legacy PowerPoint presentation.</summary>
        Ppt = 3,

        /// <summary>Not a CFB file, or a CFB file whose class ID is not recognised.</summary>
        Unknown = 4
    }

    /// <summary>
    /// Lookup table from the class ID (GUID) read by GetCfbFileFormat to the format it identifies.
    /// Two different class IDs map to Xls.
    /// </summary>
    private static readonly Dictionary<Guid, CfbFileFormat> Formats = new Dictionary<Guid, CfbFileFormat>
    {
        [new Guid("{00020810-0000-0000-c000-000000000046}")] = CfbFileFormat.Xls,
        [new Guid("{00020820-0000-0000-c000-000000000046}")] = CfbFileFormat.Xls,
        [new Guid("{00020906-0000-0000-c000-000000000046}")] = CfbFileFormat.Doc,
        [new Guid("{000c1084-0000-0000-c000-000000000046}")] = CfbFileFormat.Msi,
        [new Guid("{64818d10-4f9b-11cf-86ea-00aa00b929e8}")] = CfbFileFormat.Ppt
    };

Additional formats identifiers can be added as needed.

I've tried this on .doc and .xls, and it has worked fine. I haven't tested on CFB files using 4096 byte sector size, as I don't even know where to find those.

The code is based on information from the following documents:

rymdsmurf
  • 625
  • 5
  • 11
1

user2173353 has what appears to be the correct solution for detecting the new Office .docx / .xlsx formats. To add some details to this, the below check appears to identify these correctly:

    /// <summary>
    /// MS .docx, .xlsx and other Office Open XML files are (correctly) identified as zip files using signature lookup.
    /// This tests whether System.IO.Packaging is able to open the stream as a package and finds at least one
    /// part in it; if so, the content is a package rather than a plain zip file.
    /// </summary>
    /// <param name="stream">Stream containing the candidate file.</param>
    /// <returns>
    /// true when the stream opens as a package that has at least one part; false when it has no parts
    /// or its content cannot be read as a package.
    /// </returns>
    private static bool IsPackage(this Stream stream)
    {
        try
        {
            // Package is IDisposable; release it as soon as the parts have been inspected.
            // NOTE(review): disposing a package opened from a caller-supplied stream is not
            // expected to close that stream - confirm against the Package.Open documentation.
            using (Package package = Package.Open(stream, FileMode.Open, FileAccess.Read))
            {
                return package.GetParts().Any();
            }
        }
        catch (FileFormatException)
        {
            // Content is not a valid package: answer the question instead of throwing.
            return false;
        }
        catch (InvalidDataException)
        {
            // Content is not a readable zip archive (the exception type presumably varies by runtime).
            return false;
        }
    }
user369142
  • 1,635
  • 15
  • 9