The design
I'm trying to implement a RGB to YUV444 conversion algorithm in hardware based in the next approximation I've got working in a C based program:
#define CLIP(X) ( (X) > 255 ? 255 : (X) < 0 ? 0 : X)
#define RGB2Y(R, G, B) CLIP(( ( 66 * (R) + 129 * (G) + 25 * (B) + 128) >> 8) + 16)
#define RGB2U(R, G, B) CLIP(( ( -38 * (R) - 74 * (G) + 112 * (B) + 128) >> 8) + 128)
#define RGB2V(R, G, B) CLIP(( ( 112 * (R) - 94 * (G) - 18 * (B) + 128) >> 8) + 128)
Simulation/verification done so far
I simulated and verified the VHDL code using a self-checking approach. I compared the VHDL output against a 'golden' reference YUV444 values generated by the know working C algorithm using several images and all simulations run successfully.
The problem
When I implemented it in hardware and inserted within a video pipeline, the video output looks fine in terms of sync and there are not obvious issues in the frame rate, flickering...etc but the problem is that the colours are not right, there's a magenta/purple cast, e.g. yellow will look light magenta and red a bit darker...etc
I'm thinking that it is likely that the UV values are clipped/saturated and Y (luminance) works fine and this is what I'm seeing in the resulting video.
The code
Please note that for simplicity I only posted the part which is doing the conversion and the relevant signal declaration types and function. The rest of the code is just a axi video stream signal wrapper which works well in terms of video synchronisation and there are not video issues in that sense, let me know if you think it would be helpful and I'll post it too.
--Functions:
-- Absolute operation of: "op1 - op2 + op3" for the UV components
function uv_op_abs(op1 : unsigned(15 downto 0); op2 : unsigned(15 downto 0); op3 : unsigned(15 downto 0))
return unsigned is
variable res1 : unsigned(15 downto 0);
begin
if op2 > op1 then
res1 := (op2 - op1);
if res1 > op3 then
return res1 - op3;
else
return op3 - res1;
end if;
else
return (op1 - op2) + op3;
end if;
end uv_op_abs;
function clip(mult_in : unsigned(15 downto 0))
return unsigned is
begin
if to_integer(unsigned(mult_in)) > 240 then
return unsigned(to_unsigned(240,8));
else
return unsigned(mult_in(7 downto 0));
end if;
end clip;
-- Signals/Constants declarations:
--Constants
constant coeff_0 : unsigned(7 downto 0) := "01000010"; --66
constant coeff_1 : unsigned(7 downto 0) := "10000001"; --129
constant coeff_2 : unsigned(7 downto 0) := "00011001"; --25
constant coeff_3 : unsigned(7 downto 0) := "00100110"; --38
constant coeff_4 : unsigned(7 downto 0) := "01001010"; --74
constant coeff_5 : unsigned(7 downto 0) := "01110000"; --112
constant coeff_6 : unsigned(7 downto 0) := "01011110"; --94
constant coeff_7 : unsigned(7 downto 0) := "00010010"; --18
constant coeff_8 : unsigned(7 downto 0) := "10000000"; --128
constant coeff_9 : unsigned(7 downto 0) := "00010000"; --16
--Pipeline registers
signal red_reg : unsigned(7 downto 0);
signal green_reg : unsigned(7 downto 0);
signal blue_reg : unsigned(7 downto 0);
signal y_red_reg_op1 : unsigned(15 downto 0);
signal y_green_reg_op1 : unsigned(15 downto 0);
signal y_blue_reg_op1 : unsigned(15 downto 0);
signal u_red_reg_op1 : unsigned(15 downto 0);
signal u_green_reg_op1 : unsigned(15 downto 0);
signal u_blue_reg_op1 : unsigned(15 downto 0);
signal v_red_reg_op1 : unsigned(15 downto 0);
signal v_green_reg_op1 : unsigned(15 downto 0);
signal v_blue_reg_op1 : unsigned(15 downto 0);
signal y_reg_op2 : unsigned(15 downto 0);
signal u_reg_op2 : unsigned(15 downto 0);
signal v_reg_op2 : unsigned(15 downto 0);
signal y_reg_op3 : unsigned(7 downto 0);
signal u_reg_op3 : unsigned(7 downto 0);
signal v_reg_op3 : unsigned(7 downto 0);
-- The process for YUV444 conversion:
RGB_YUV_PROC : process(clk)
begin
if rising_edge(clk) then
if rst = '1' then
red_reg <= (others => '0');
green_reg <= (others => '0');
blue_reg <= (others => '0');
y_red_reg_op1 <= (others => '0');
y_green_reg_op1 <= (others => '0');
y_blue_reg_op1 <= (others => '0');
u_red_reg_op1 <= (others => '0');
u_green_reg_op1 <= (others => '0');
u_blue_reg_op1 <= (others => '0');
v_red_reg_op1 <= (others => '0');
v_green_reg_op1 <= (others => '0');
v_blue_reg_op1 <= (others => '0');
y_reg_op2 <= (others => '0');
u_reg_op2 <= (others => '0');
v_reg_op2 <= (others => '0');
y_reg_op3 <= (others => '0');
u_reg_op3 <= (others => '0');
v_reg_op3 <= (others => '0');
yuv444_out <= (others => '0');
soff_sync <= '0';
else
--Sync with first video frame with the tuser (sof) input signal
if rgb_sof_in = '1' then
soff_sync <= '1';
end if;
--Fetch a pixel
if (rgb_sof_in = '1' or soff_sync = '1') and rgb_valid_in = '1' and yuv444_ready_out = '1' and bypass = '0' then
green_reg <= unsigned(rgb_in(7 downto 0));
blue_reg <= unsigned(rgb_in(15 downto 8));
red_reg <= unsigned(rgb_in(23 downto 16));
end if;
-- RGB to YUV conversion
-- Y--> CLIP(( ( 66 * (R) + 129 * (G) + 25 * (B) + 128) >> 8) + 16)
-- U--> CLIP(( ( -38 * (R) - 74 * (G) + 112 * (B) + 128) >> 8) + 128)
-- V--> CLIP(( ( 112 * (R) - 94 * (G) - 18 * (B) + 128) >> 8) + 128)
if (rgb_sof_in = '1' or soff_sync = '1') and (valid_delay = '1' or validff1 = '1') and yuv444_ready_out = '1' and bypass = '0' then
--Y calc ( 66 * (R) + 129 * (G) + 25 * (B) + 128) >> 8) + 16)
y_red_reg_op1 <= coeff_0 * red_reg;
y_green_reg_op1 <= coeff_1 * green_reg;
y_blue_reg_op1 <= coeff_2 * blue_reg;
y_reg_op2 <= y_red_reg_op1 + y_green_reg_op1 + y_blue_reg_op1 + (X"00" & coeff_8);
y_reg_op3 <= (y_reg_op2(15 downto 8) + coeff_9);
--U calc ( -38 * (R) - 74 * (G) + 112 * (B) + 128) >> 8) + 128)
u_red_reg_op1 <= coeff_3 * red_reg;
u_green_reg_op1 <= coeff_4 * green_reg;
u_blue_reg_op1 <= coeff_5 * blue_reg;
u_reg_op2 <= uv_op_abs(u_blue_reg_op1, (u_red_reg_op1 + u_green_reg_op1), (X"00" & coeff_8));
u_reg_op3 <= (u_reg_op2(15 downto 8) + coeff_8);
--V calc ( 112 * (R) - 94 * (G) - 18 * (B) + 128) >> 8) + 128)
v_red_reg_op1 <= coeff_5 * red_reg;
v_green_reg_op1 <= coeff_6 * green_reg;
v_blue_reg_op1 <= coeff_7 * blue_reg;
v_reg_op2 <= uv_op_abs(v_red_reg_op1, (v_blue_reg_op1 + v_green_reg_op1), (X"00" & coeff_8));
v_reg_op3 <= (v_reg_op2(15 downto 8) + coeff_8);
--Output data
yuv444_out <= std_logic_vector(v_reg_op3) & std_logic_vector(u_reg_op3) & std_logic_vector(y_reg_op3);
elsif yuv444_ready_out = '1' and rgb_valid_in = '1' and bypass = '1' then
yuv444_out <= rgb_in;
end if;
end if;
end if;
end process; -- RGB_YUV_PROC
I've also tried adding the 'clip; function to control overflow thinking that it'll help to the the synthesis tool in case of 'clipping' but it didn't help, the issue persists:
if (rgb_sof_in = '1' or soff_sync = '1') and (valid_delay = '1' or validff1 = '1') and yuv444_ready_out = '1' and bypass = '0' then
--Y calc ( 66 * (R) + 129 * (G) + 25 * (B) + 128) >> 8) + 16)
y_red_reg_op1 <= coeff_0 * red_reg;
y_green_reg_op1 <= coeff_1 * green_reg;
y_blue_reg_op1 <= coeff_2 * blue_reg;
y_reg_op2 <= y_red_reg_op1 + y_green_reg_op1 + y_blue_reg_op1 + (X"00" & coeff_8);
y_reg_op3 <= clip( X"00" & (y_reg_op2(15 downto 8) + coeff_9));
--U calc ( -38 * (R) - 74 * (G) + 112 * (B) + 128) >> 8) + 128)
u_red_reg_op1 <= coeff_3 * red_reg;
u_green_reg_op1 <= coeff_4 * green_reg;
u_blue_reg_op1 <= coeff_5 * blue_reg;
u_reg_op2 <= uv_op_abs(u_blue_reg_op1, (u_red_reg_op1 + u_green_reg_op1), (X"00" & coeff_8));
u_reg_op3 <= clip( X"00" & (u_reg_op2(15 downto 8) + coeff_8));
--V calc ( 112 * (R) - 94 * (G) - 18 * (B) + 128) >> 8) + 128)
v_red_reg_op1 <= coeff_5 * red_reg;
v_green_reg_op1 <= coeff_6 * green_reg;
v_blue_reg_op1 <= coeff_7 * blue_reg;
v_reg_op2 <= uv_op_abs(v_red_reg_op1, (v_blue_reg_op1 + v_green_reg_op1), (X"00" & coeff_8));
v_reg_op3 <= clip( X"00"& (v_reg_op2(15 downto 8) + coeff_8));
Question
I'm aware that in hardware design a successful simulation doesn't necessarily mean that the the design is going to work in the same way after synthesis and I'm sure there are lots of things can be improved in the code. There's something fundamentally wrong in this approach but I can't see it so far, does anyone have an idea what could be wrong and why?