19

I noticed that sometimes, even when I don't use iostream or related I/O libraries, the binaries produced by MinGW are still unreasonably large.

For example, I wrote a program that uses only vector and cstdio and compiled it with -O2 -flto, yet the executable can be as large as 2MB! I ran nm main.exe > e.txt and was shocked to see all the iostream-related functions in it.

After some googling, I learnt to use -ffunction-sections -Wl,-gc-sections, which reduces the program size from 2MB to ~300KB (or 100+KB with -s). Excellent!

To further test the effect of -ffunction-sections -Wl,-gc-sections, here is another program:

#include <cstdio>
#include <vector>
#include <tuple>
#include <algorithm>
#include <chrono>
#include <windows.h>

#undef min

// Plain aggregate holding a pair of integer coordinates.
struct Point {
    int x;
    int y;
};

// Compile-time constant that half_length() is derived from.
constexpr int length = 5;

// Yields `length` unchanged when it is odd and `length - 1` when it is even.
// NOTE(review): the name suggests halving, but no division happens here —
// confirm the intended meaning before relying on the name.
constexpr int half_length() {
    return (length % 2 != 0) ? length : length - 1;
}

// Prints the signature of this instantiation (as reported by the compiler),
// then an empty line, then invokes `f` with no arguments and returns its
// result converted to int.
template<class F>
int func_template(F&& f) {
#if defined(_MSC_VER)
    const char* const signature = __FUNCSIG__;
#else
    const char* const signature = __PRETTY_FUNCTION__;
#endif
    puts(signature);
    printf("\n");
    const int result = f();
    return result;
}

// Stateless function object whose call operator always returns 59.
struct fake_func {
    int operator()() const {
        return 59;
    }
};

// Prints the signature of this instantiation (as reported by the compiler),
// then an empty line, then calls `f` with `args` perfectly forwarded and
// returns the result converted to int.
template<class F, class... Args>
int pass_args(F&& f, Args&&... args) {
#if defined(_MSC_VER)
    const char* const signature = __FUNCSIG__;
#else
    const char* const signature = __PRETTY_FUNCTION__;
#endif
    puts(signature);
    printf("\n");
    const int result = f(std::forward<Args>(args)...);
    return result;
}

// Base case of the variadic minimum: the minimum of one value is that value.
template<class T>
T min(T x) {
    return x;
}

// Minimum of two or more values, compared with operator<.
// The minimum of the remaining arguments is converted to the type of the
// first argument before the comparison; on a tie that converted value is
// the one returned.
template<class T, class... Args>
T min(T x, Args... args) {
    T smallest_of_rest = min(args...);
    if (x < smallest_of_rest) {
        return x;
    }
    return smallest_of_rest;
}

// Overload set that prints a type-tagged rendering of its arguments to stdout.

// int: prints the value followed by a 'd' suffix and a space.
void type_verifier(int x) {
    printf("%dd ", x);
}

// char: prints the character wrapped in single quotes, then a space.
void type_verifier(char x) {
    printf("'%c' ", x);
}

// double: prints the value in fixed notation followed by an 'f' suffix and a space.
void type_verifier(double x) {
    printf("%lff ", x);
}

// Fallback for a single argument of any other type; the value itself is ignored.
template<class T>
void type_verifier(T) {
    printf("unknown ");
}

// Several arguments: report the first one through the single-argument
// overloads above, then recurse on the remaining arguments.
template<class Head, class... Tail>
void type_verifier(Head x, Tail... args) {
    type_verifier(x);
    type_verifier(args...);
}

// Number of characters most recently stored in `buf` by send(), not counting
// the terminating NUL; negative if the last formatting attempt failed.
int bufLen;
// Destination buffer that send() formats into.
char buf[100];

// Formats `args` printf-style into `buf`; the first argument is the format
// string. Output that does not fit is truncated (and still NUL-terminated)
// instead of overrunning `buf`.
// Returns the number of characters stored in `buf`, or a negative value on a
// formatting error; the same value is recorded in `bufLen`.
template<class... Args>
inline int send(Args... args) {
    // snprintf reports the length the full output would have had, which can
    // exceed the buffer; clamp it to what was actually stored.
    const int needed = std::snprintf(buf, sizeof(buf), args...);
    const int capacity = static_cast<int>(sizeof(buf)) - 1;
    if (needed < 0) {
        bufLen = needed;
    } else if (needed > capacity) {
        bufLen = capacity;
    } else {
        bufLen = needed;
    }
    return bufLen;
}

// Demonstrates inline vs. regular nested namespaces using two functions that
// share the name `func`.
// NOTE(review): these declarations are added to namespace `std`, which the
// C++ standard does not permit for user code (undefined behavior). Moving them
// to a project namespace requires updating the qualified calls that use
// `std::func`, `std::v1::func` and `std::v2::func`.
namespace std {

// Inline namespace: `func` here is also reachable as a member of the
// enclosing namespace, without the `v1::` qualifier.
inline namespace v1 {
    // Prints "I am v1".
    void func() {
        printf("I am v1\n");
    }
}

// Regular nested namespace: `func` here requires the `v2::` qualifier.
namespace v2 {
    // Prints "I am v2".
    void func() {
        printf("I am v2\n");
    }
}

}

// Exercises a grab bag of C++11 language and library features (containers,
// tuples, lambdas, variadic templates, <algorithm>, <chrono>) and prints the
// results with printf. Always returns 0.
int main() {
    // Prints every element followed by a space, then ends the line.
    const auto print_ints = [](const std::vector<int> &values) {
        for (const int value : values) printf("%d ", value);
        printf("\n");
    };

    // Prints an interval in whole milliseconds. The tick count is cast to
    // long long and printed with %lld so the format matches the argument on
    // every toolchain (the MSVC-specific %I64d is not understood elsewhere).
    const auto print_elapsed_ms = [](std::chrono::steady_clock::duration elapsed) {
        const auto ms =
            std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
        printf("time: %lld ms\n", static_cast<long long>(ms));
    };

    // Vector built from an initializer list.
    std::vector<int> v {1, 2, 3, 4, 5};
    print_ints(v);

    // Aggregate initialization.
    Point p {1, 2};
    printf("%d %d\n", p.x, p.y);

    // Tuples: element access with get, and unpacking a returned tuple with tie.
    auto t = std::make_tuple("Hello World", 12);
    printf("%s %d\n", std::get<0>(t), std::get<1>(t));
    int a = 0;
    int b = 0;
    auto f = []() { return std::make_tuple(1, 2); };
    std::tie(a, b) = f();
    printf("%d %d\n", a, b);

    //int test_constexpr[half_length() + 4];

    // Callables passed to function templates: lambdas and a function object.
    int ft = func_template([]{ return 42; });
    printf("func_template: %d\n", ft);
    ft = func_template(fake_func {});
    printf("func_template: %d\n", ft);
    ft = pass_args([](int x, int y) { return x + y; }, 152, 58);
    printf("pass_args: %d\n", ft);
    ft = pass_args([](int n, const char *m) {
        for (int i = 0; i < n; i++) printf("%c ", m[i]);
        printf("\n");
        return 0;
    }, 5, "Hello");

    // Variadic templates.
    printf("min: %d\n", min(3, 4, 2, 1, 5));
    type_verifier(12, 'A', 0.5, "Hello");
    printf("\n");

/*  send("Hello World");
    send("%d", 1);
    send("%d", "1234");
    sprintf(buf, "%d", "123");*/

    // Inline namespace lookup: unqualified-by-version and explicit versions.
    std::func();
    std::v1::func();
    std::v2::func();

    // Rotate left by two positions.
    std::rotate(v.begin(), v.begin() + 2, v.end());
    print_ints(v);

    auto start = std::chrono::steady_clock::now();

    std::vector<int> x {2, 4, 2, 0, 5, 10, 7, 3, 7, 1};
    printf("insertion sort: ");
    print_ints(x);
    // Insertion sort: rotate each element into its place within the sorted
    // prefix [begin, i), printing the vector after every step.
    for (auto i = x.begin(); i != x.end(); ++i) {
        std::rotate(std::upper_bound(x.begin(), i, *i), i, i + 1);
        print_ints(x);
    }

    // Heap algorithms on a vector.
    std::vector<int> heap {7, 5, 3, 4, 2};
    std::make_heap(heap.begin(), heap.end());
    std::pop_heap(heap.begin(), heap.end());
    printf("Pop heap (%d)\n", heap.back());
    heap.pop_back();
    heap.push_back(1);
    std::push_heap(heap.begin(), heap.end());
    std::sort_heap(heap.begin(), heap.end());
    print_ints(heap);

    auto end = std::chrono::steady_clock::now();
    print_elapsed_ms(end - start);

    {
        // Move every element except the last one position towards the back.
        auto u = v;
        std::move_backward(u.begin(), u.end() - 1, u.end());
        print_ints(u);
    }

    {
        // Move every element except the first one position towards the front.
        auto u = v;
        std::move(u.begin() + 1, u.end(), u.begin());
        print_ints(u);
    }

    // Time a 2000 ms Sleep, reported in milliseconds.
    start = std::chrono::steady_clock::now();
    Sleep(2000);
    end = std::chrono::steady_clock::now();
    print_elapsed_ms(end - start);

    // Time another 2000 ms Sleep, reported in fractional seconds.
    const auto before = std::chrono::steady_clock::now();
    Sleep(2000);
    const auto after = std::chrono::steady_clock::now();
    printf("%f seconds\n", std::chrono::duration<double>(after - before).count());

    return 0;
}

To my disappointment, the final program is once again > 2MB.

Interestingly, cl.exe thoughtfully removes all iostream-related functions consistently, even when I don't use /O2 or any other flags, just cl.exe main.cpp. (For the code above, cl.exe produces a 100+KB binary.)

Did I miss any other useful gcc flags for this?

Specification:

  • Mingw-w64 gcc 6.1.0
  • Mingw-w64 gcc 6.2.0
  • Visual Studio 2017 RC
  • All binaries are linked statically

Compare with Linux

I compared the binaries produced by gcc 4.9.2 (Linux) and gcc 4.9.3 (mingw-w64) for the above code (except windows.h and Sleep were removed).

Compile flag

g++ -o c++11 c++11.cpp -std=c++11 -static-libgcc -static-libstdc++ -ffunction-sections -Wl,-gc-sections -O2

Linux gcc did successfully strip away iostream and functions without the need for -flto while Mingw-w64 gcc just can't do it properly.

Windows only supports the PE format while Linux supports the ELF format, which allows Linux to use the Gold linker. Maybe this is the explanation?

Update

I eventually filed a bug at https://sourceforge.net/p/mingw-w64/bugs/578/ . Let's hope it gets some attention!

John London
  • 1,164
  • 13
  • 24
  • This might help: [How to remove unused C/C++ symbols with GCC and ld?](https://stackoverflow.com/questions/6687630/how-to-remove-unused-c-c-symbols-with-gcc-and-ld) – benbuck Dec 27 '16 at 11:25
  • Tried everything: -Os (reduce 2KB), -fwhole-program (no change), -fomit-frame-pointer (no change). -why_live is not available. – John London Dec 27 '16 at 14:22
  • `length & 1 ? length : length - 1` can be changed to `length + (length & 1) - 1` – phuclv Dec 30 '16 at 05:16
  • For me `-fwhole-program` did it, with gcc 8.2.0 from MSYS2. – Michel Rouzic Aug 06 '18 at 16:01
  • See the bug report https://sourceware.org/bugzilla/show_bug.cgi?id=11539 with a work-in-progress patch (that has not been updated for a couple of years now...) – benjarobin Oct 18 '19 at 10:29

1 Answer

4

Try stripping debug and symbol info from static libstdc++ via -Wl,--strip-all. This reduced my executable from 9M to 670K on Cygwin (13x) and from 6M to 80K on Ubuntu (80x).

yugr
  • 13,457
  • 3
  • 37
  • 71
  • `-Wl,--strip-all` is the same as `-s` I mentioned above. Stripping debug info and symbol table does not strip the unused `iostream` related functions, which is something MSVC does without telling (no special flags required)! – John London Nov 30 '16 at 00:28
  • Ok, I see. FYI a 4.8.2 x86_64-w64-mingw32-g++ (Ubuntu 14.04) with `-std=c++11 -O2 -ffunction-sections -Wl,-gc-sections -static-libstdc++ -s` generates a 100K executable so the size issue may be specific to a particular toolchain version (and thus harder to investigate remotely). Have you tried analyzing linker map (`-Wl,--print-map`)? – yugr Nov 30 '16 at 06:05
  • Since I use the latest MSVC, I changed to gcc 6.2.0 to be fair. `-Wl,-print-gc-sections` shows that gcc does strip some `iostream` and `locale` functions, but `-Wl,--print-map` shows that there are still leftovers. MSVC: 150KB, gcc: 738KB. See http://pastebin.com/raw/4uGCm7Yy – John London Nov 30 '16 at 07:08
  • I think you attached gc.txt rather than map.txt (the dump contains output from -print-gc-sections). "gcc: 738KB" - I'm confused, originally you said it's over 2M... – yugr Nov 30 '16 at 17:49
  • FWIW experimental support for `--gc-sections` on PE/COFF targets (i.e. MinGW/Cygwin) has only [landed in Binutils 2.25](https://sourceware.org/bugzilla/show_bug.cgi?id=11539) in mid-2015 so it's likely to be sub-optimal. – yugr Dec 01 '16 at 05:21
  • Sorry, 2MB was without `-s`. I am using binutils 2.27. – John London Dec 01 '16 at 09:11
  • You can also add -fdata-sections in addition to what @yugr suggested, but I don't know whether it works on MinGW as it does in GCC in general. What I'm wondering is why nobody has implemented something like -frtti-without-strings yet, because RTTI strings are what costs a lot of space in binaries most of the time. – daminetreg Dec 12 '16 at 06:54
  • @daminetreg I've tried `-fdata-sections` for John's code but it didn't change anything so I didn't bother to mention it. I've also tried linking with `-fno-exceptions` and `-fno-rtti` but it didn't help either. – yugr Dec 12 '16 at 07:19
  • @daminetreg I added my finding on Linux's gcc in my question, please take a look. – John London Dec 19 '16 at 13:16