Index: gcc/defaults.h
===================================================================
--- gcc/defaults.h	(revision 306416)
+++ gcc/defaults.h	(working copy)
@@ -886,6 +886,14 @@ see the files COPYING3 and COPYING.RUNTI
 #define FLOAT_WORDS_BIG_ENDIAN WORDS_BIG_ENDIAN
 #endif
 
+/* Similarly if VECTOR_ELEMENTS_BIG_ENDIAN is not defined in the header
+   files, then the element ordering follows the target byte endianness,
+   BYTES_BIG_ENDIAN (element 0 from least memory address in LE mode, and
+   from greatest memory address in BE mode).  */
+#ifndef VECTOR_ELEMENTS_BIG_ENDIAN
+#define VECTOR_ELEMENTS_BIG_ENDIAN BYTES_BIG_ENDIAN
+#endif
+
 #ifdef TARGET_FLT_EVAL_METHOD
 #define TARGET_FLT_EVAL_METHOD_NON_DEFAULT 1
 #else
Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	(revision 306416)
+++ gcc/tree-vect-data-refs.c	(working copy)
@@ -2808,7 +2808,7 @@ vect_permute_store_chain (VEC(tree,heap)
 	  perm_dest = create_tmp_var (vectype, "vect_inter_high");
 	  DECL_GIMPLE_REG_P (perm_dest) = 1;
 	  add_referenced_var (perm_dest);
-          if (BYTES_BIG_ENDIAN)
+          if (VECTOR_ELEMENTS_BIG_ENDIAN)
 	    {
 	      high_code = VEC_INTERLEAVE_HIGH_EXPR;
 	      low_code = VEC_INTERLEAVE_LOW_EXPR;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 306416)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -4699,7 +4699,7 @@ supportable_widening_operation (enum tre
   switch (code)
     {
     case WIDEN_MULT_EXPR:
-      if (BYTES_BIG_ENDIAN)
+      if (VECTOR_ELEMENTS_BIG_ENDIAN)
         {
           c1 = VEC_WIDEN_MULT_HI_EXPR;
           c2 = VEC_WIDEN_MULT_LO_EXPR;
@@ -4712,7 +4712,7 @@ supportable_widening_operation (enum tre
       break;
 
     CASE_CONVERT:
-      if (BYTES_BIG_ENDIAN)
+      if (VECTOR_ELEMENTS_BIG_ENDIAN)
         {
           c1 = VEC_UNPACK_HI_EXPR;
           c2 = VEC_UNPACK_LO_EXPR;
@@ -4725,7 +4725,7 @@ supportable_widening_operation (enum tre
       break;
 
     case FLOAT_EXPR:
-      if (BYTES_BIG_ENDIAN)
+      if (VECTOR_ELEMENTS_BIG_ENDIAN)
         {
           c1 = VEC_UNPACK_FLOAT_HI_EXPR;
           c2 = VEC_UNPACK_FLOAT_LO_EXPR;
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 306416)
+++ gcc/config/arm/arm.c	(working copy)
@@ -17618,6 +17618,30 @@ arm_regno_class (int regno)
   return FPA_REGS;
 }
 
+/* Return true if a register in RCLASS can change from mode FROM to mode TO.
+   Same-size changes are always OK.  FPA registers reformat values to internal
+   precision; VFP registers keep their mode unless both modes are NEON.  */
+
+bool
+arm_can_change_mode_class (enum machine_mode from, enum machine_mode to,
+			   enum reg_class rclass)
+{
+  if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to))
+    return true;
+  
+  if (reg_classes_intersect_p (FPA_REGS, rclass))
+    return false;
+
+  if ((VALID_NEON_DREG_MODE (from) || VALID_NEON_QREG_MODE (from))
+      && (VALID_NEON_DREG_MODE (to) || VALID_NEON_QREG_MODE (to)))
+    return true;
+  
+  if (reg_classes_intersect_p (VFP_REGS, rclass))
+    return false;
+  
+  return true;
+}
+
 /* Handle a special case when computing the offset
    of an argument from the frame pointer.  */
 int
Index: gcc/config/arm/arm.h
===================================================================
--- gcc/config/arm/arm.h	(revision 306416)
+++ gcc/config/arm/arm.h	(working copy)
@@ -592,6 +592,9 @@ extern int low_irq_latency;
    VFP-format or some other floating point co-processor's format doubles.  */
 #define FLOAT_WORDS_BIG_ENDIAN (arm_float_words_big_endian ())
 
+/* NEON vectors are never purely big-endian (at least for Q registers).  */
+#define VECTOR_ELEMENTS_BIG_ENDIAN 0
+
 #define UNITS_PER_WORD	4
 
 /* Use the option -mvectorize-with-neon-quad to override the use of doubleword
@@ -1300,14 +1303,8 @@ enum reg_class
   LIM_REG_CLASSES							     \
 }
 
-/* FPA registers can't do subreg as all values are reformatted to internal
-   precision.  VFP registers may only be accessed in the mode they
-   were set.  */
-#define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS)	\
-  (GET_MODE_SIZE (FROM) != GET_MODE_SIZE (TO)		\
-   ? reg_classes_intersect_p (FPA_REGS, (CLASS))	\
-     || reg_classes_intersect_p (VFP_REGS, (CLASS))	\
-   : 0)
+#define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS)			 \
+  (!arm_can_change_mode_class ((FROM), (TO), (CLASS)))
 
 /* We need to define this for LO_REGS on Thumb-1.  Otherwise we can end up
    using r0-r4 for function arguments, r7 for the stack frame and don't have
Index: gcc/config/arm/arm-protos.h
===================================================================
--- gcc/config/arm/arm-protos.h	(revision 306416)
+++ gcc/config/arm/arm-protos.h	(working copy)
@@ -27,6 +27,10 @@ extern void arm_override_options (void);
 extern void arm_optimization_options (int, int);
 extern int use_return_insn (int, rtx);
 extern enum reg_class arm_regno_class (int);
+#ifdef HAVE_MACHINE_MODES
+extern bool arm_can_change_mode_class (enum machine_mode, enum machine_mode,
+				       enum reg_class);
+#endif
 extern void arm_load_pic_register (unsigned long);
 extern int arm_volatile_func (void);
 extern const char *arm_output_epilogue (rtx);
Index: gcc/config/arm/neon.md
===================================================================
--- gcc/config/arm/neon.md	(revision 306416)
+++ gcc/config/arm/neon.md	(working copy)
@@ -398,6 +398,11 @@
 ;; A list of widening operators
 (define_code_iterator SE [sign_extend zero_extend])
 
+;; Numeric codes for opcode features for sign/zero-extend ops (see 'T' in
+;; arm_print_operand).
+
+(define_code_attr SE_magic [(sign_extend "1") (zero_extend "0")])
+
 ;; Assembler mnemonics for above codes.
 (define_code_attr VQH_mnem [(plus "vadd") (smin "vmin") (smax "vmax")
 			    (umin "vmin") (umax "vmax")])
@@ -811,12 +816,6 @@
           (parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
   "TARGET_NEON"
 {
-  if (BYTES_BIG_ENDIAN)
-    {
-      int elt = INTVAL (operands[2]);
-      elt = GET_MODE_NUNITS (<MODE>mode) - 1 - elt;
-      operands[2] = GEN_INT (elt);
-    }
   return "vmov%?.<V_uf_sclr>\t%0, %P1[%c2]";
 }
   [(set_attr "predicable" "yes")
@@ -836,7 +835,7 @@
   int regno = REGNO (operands[1]);
 
   if (BYTES_BIG_ENDIAN)
-    elt = half_elts - 1 - elt;
+    hi = 2 - hi;
 
   operands[1] = gen_rtx_REG (<V_HALF>mode, regno + hi);
   operands[2] = GEN_INT (elt);
@@ -854,7 +853,12 @@
           (parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
   "TARGET_NEON"
 {
-  int regno = REGNO (operands[1]) + 2 * INTVAL (operands[2]);
+  int regno = REGNO (operands[1]);
+  /* Big-endian swaps the D-register halves: element 0 is at regno + 2.  */
+  if (BYTES_BIG_ENDIAN)
+    regno += 2 - 2 * INTVAL (operands[2]);
+  else
+    regno += 2 * INTVAL (operands[2]);
 
   operands[1] = gen_rtx_REG (DImode, regno);
 
@@ -1350,11 +1354,14 @@
 ;; shift-count granularity. That's good enough for the middle-end's current
 ;; needs.
 
+;; Note that it's not safe to perform such an operation in big-endian mode,
+;; due to element-ordering issues.
+
 (define_expand "vec_shr_<mode>"
   [(match_operand:VDQ 0 "s_register_operand" "")
    (match_operand:VDQ 1 "s_register_operand" "")
    (match_operand:SI 2 "const_multiple_of_8_operand" "")]
-  "TARGET_NEON"
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
 {
   rtx zero_reg;
   HOST_WIDE_INT num_bits = INTVAL (operands[2]);
@@ -1382,7 +1389,7 @@
   [(match_operand:VDQ 0 "s_register_operand" "")
    (match_operand:VDQ 1 "s_register_operand" "")
    (match_operand:SI 2 "const_multiple_of_8_operand" "")]
-  "TARGET_NEON"
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
 {
   rtx zero_reg;
   HOST_WIDE_INT num_bits = INTVAL (operands[2]);
@@ -1482,67 +1489,13 @@
                     (const_string "neon_int_1") (const_string "neon_int_5")))]
 )
 
-; FIXME: We wouldn't need the following insns if we could write subregs of
-; vector registers. Make an attempt at removing unnecessary moves, though
-; we're really at the mercy of the register allocator.
-
-(define_insn "neon_move_lo_quad_<mode>"
-  [(set (match_operand:ANY128 0 "s_register_operand" "=w")
-        (vec_concat:ANY128
-          (match_operand:<V_HALF> 1 "s_register_operand" "w")
-          (vec_select:<V_HALF>
-		(match_operand:ANY128 3 "s_register_operand" "0")
-	        (match_operand:ANY128 2 "vect_par_constant_high" ""))))]
-  "TARGET_NEON"
-{
-  int dest = REGNO (operands[0]);
-  int src = REGNO (operands[1]);
-
-  if (dest != src)
-    return "vmov\t%e0, %P1\t@ neon_move_lo_quad";
-  else
-    return "";
-}
-  [(set_attr "neon_type" "neon_bp_simple")]
-)
-
-(define_insn "neon_move_hi_quad_<mode>"
-  [(set (match_operand:ANY128 0 "s_register_operand" "=w")
-        (vec_concat:ANY128
-          (vec_select:<V_HALF>
-		(match_operand:ANY128 3 "s_register_operand" "0")
-	        (match_operand:ANY128 2 "vect_par_constant_low" ""))
-          (match_operand:<V_HALF> 1 "s_register_operand" "w")))]
-	   
-  "TARGET_NEON"
-{
-  int dest = REGNO (operands[0]);
-  int src = REGNO (operands[1]);
-
-  if (dest + 2 != src)
-    return "vmov\t%f0, %P1\t@ neon_move_hi_quad";
-  else
-    return "";
-}
-  [(set_attr "neon_type" "neon_bp_simple")]
-)
-
 (define_expand "move_hi_quad_<mode>"
  [(match_operand:ANY128 0 "s_register_operand" "")
   (match_operand:<V_HALF> 1 "s_register_operand" "")]
  "TARGET_NEON"
 {
-  rtvec v = rtvec_alloc (<V_mode_nunits>/2);
-  rtx t1;
-  int i;
-
-  for (i=0; i < (<V_mode_nunits>/2); i++)
-     RTVEC_ELT (v, i) = GEN_INT (i);
-
-  t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-  emit_insn (gen_neon_move_hi_quad_<mode> (operands[0], operands[1], t1,
-					   operands[0]));
-
+  rtx desthi = simplify_gen_subreg (<V_HALF>mode, operands[0], <MODE>mode, 8);
+  emit_move_insn (desthi, operands[1]);
   DONE;
 })
 
@@ -1551,17 +1504,8 @@
   (match_operand:<V_HALF> 1 "s_register_operand" "")]
  "TARGET_NEON"
 {
-  rtvec v = rtvec_alloc (<V_mode_nunits>/2);
-  rtx t1;
-  int i;
-
-  for (i=0; i < (<V_mode_nunits>/2); i++)
-     RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
-
-  t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-  emit_insn (gen_neon_move_lo_quad_<mode> (operands[0], operands[1], t1,
-					   operands[0]));
-
+  rtx destlo = simplify_gen_subreg (<V_HALF>mode, operands[0], <MODE>mode, 0);
+  emit_move_insn (destlo, operands[1]);
   DONE;
 })
 
@@ -5558,243 +5502,390 @@
 })
 
 (define_insn "neon_vec_unpack<US>_lo_<mode>"
-  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+  [(set (match_operand:<V_unpack> 0 "s_register_operand" "=w")
         (SE:<V_unpack> (vec_select:<V_HALF>
-			  (match_operand:VU 1 "register_operand" "w")
+			  (match_operand:VU 1 "s_register_operand" "w")
 			  (match_operand:VU 2 "vect_par_constant_low" ""))))]
   "TARGET_NEON"
-  "vmovl.<US><V_sz_elem> %q0, %e1"
+  "vmovl.<US><V_sz_elem>\t%q0, %e1"
   [(set_attr "neon_type" "neon_shift_1")]
 )
 
 (define_insn "neon_vec_unpack<US>_hi_<mode>"
-  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+  [(set (match_operand:<V_unpack> 0 "s_register_operand" "=w")
         (SE:<V_unpack> (vec_select:<V_HALF>
-			  (match_operand:VU 1 "register_operand" "w")
+			  (match_operand:VU 1 "s_register_operand" "w")
 			  (match_operand:VU 2 "vect_par_constant_high" ""))))]
   "TARGET_NEON"
-  "vmovl.<US><V_sz_elem> %q0, %f1"
+  "vmovl.<US><V_sz_elem>\t%q0, %f1"
   [(set_attr "neon_type" "neon_shift_1")]
 )
 
 (define_expand "vec_unpack<US>_hi_<mode>"
-  [(match_operand:<V_unpack> 0 "register_operand" "")
-   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
- "TARGET_NEON"
-  {
-   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
-   rtx t1;
-   int i;
-   for (i = 0; i < (<V_mode_nunits>/2); i++)
-     RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
-  
-   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-   emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], 
-                                                 operands[1], 
-					         t1));
-   DONE;
-  }
-)
+  [(match_operand:<V_unpack> 0 "s_register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "s_register_operand"))]
+  "TARGET_NEON"
+{
+  if (BYTES_BIG_ENDIAN)
+    {
+      rtx hi_quad = simplify_gen_subreg (<V_HALF>mode, operands[1],
+					 <MODE>mode, 8);
+      rtx tmp = gen_reg_rtx (<V_unpack>mode);
+      rtx tmp2 = gen_reg_rtx (V2DImode);
+      rtx tmp3 = gen_reg_rtx (V2DImode);
+
+      emit_insn (gen_neon_vmovl<V_half> (tmp, hi_quad, GEN_INT (<SE_magic>)));
+      neon_reinterpret (tmp2, tmp);
+      emit_insn (gen_neon_vextv2di (tmp3, tmp2, tmp2, GEN_INT (1)));
+      neon_reinterpret (operands[0], tmp3);
+    }
+  else
+    {
+      rtvec v = rtvec_alloc (<V_mode_nunits>/2);
+      rtx t1;
+      int i;
+
+      for (i = 0; i < (<V_mode_nunits> / 2); i++)
+	RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits> / 2) + i);
+
+      t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+
+      emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], 
+                                                    operands[1], 
+					            t1));
+    }
+  DONE;
+})
 
 (define_expand "vec_unpack<US>_lo_<mode>"
-  [(match_operand:<V_unpack> 0 "register_operand" "")
-   (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))]
- "TARGET_NEON"
-  {
-   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
-   rtx t1;
-   int i;
-   for (i = 0; i < (<V_mode_nunits>/2) ; i++)
-     RTVEC_ELT (v, i) = GEN_INT (i);
-   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-   emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], 
-                                                 operands[1], 
-				   	         t1));
-   DONE;
-  }
-)
+  [(match_operand:<V_unpack> 0 "s_register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "s_register_operand" ""))]
+  "TARGET_NEON"
+{
+  if (BYTES_BIG_ENDIAN)
+    {
+      rtx lo_quad = simplify_gen_subreg (<V_HALF>mode, operands[1],
+					 <MODE>mode, 0);
+      rtx tmp = gen_reg_rtx (<V_unpack>mode);
+      rtx tmp2 = gen_reg_rtx (V2DImode);
+      rtx tmp3 = gen_reg_rtx (V2DImode);
+
+      emit_insn (gen_neon_vmovl<V_half> (tmp, lo_quad, GEN_INT (<SE_magic>)));
+      neon_reinterpret (tmp2, tmp);
+      emit_insn (gen_neon_vextv2di (tmp3, tmp2, tmp2, GEN_INT (1)));
+      neon_reinterpret (operands[0], tmp3);
+    }
+  else
+    {
+      rtvec v = rtvec_alloc (<V_mode_nunits>/2);
+      rtx t1;
+      int i;
+
+      for (i = 0; i < (<V_mode_nunits> / 2); i++)
+	RTVEC_ELT (v, i) = GEN_INT (i);
+
+      t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+
+      emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], 
+                                                    operands[1], 
+				   	            t1));
+    }
+  DONE;
+})
 
 (define_insn "neon_vec_<US>mult_lo_<mode>"
- [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
-       (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
-			   (match_operand:VU 1 "register_operand" "w") 
+  [(set (match_operand:<V_unpack> 0 "s_register_operand" "=w")
+	(mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
+			   (match_operand:VU 1 "s_register_operand" "w") 
                            (match_operand:VU 2 "vect_par_constant_low" "")))
- 		        (SE:<V_unpack> (vec_select:<V_HALF>
-                           (match_operand:VU 3 "register_operand" "w") 
-                           (match_dup 2)))))]
+			 (SE:<V_unpack> (vec_select:<V_HALF>
+			   (match_operand:VU 3 "s_register_operand" "w") 
+			   (match_dup 2)))))]
   "TARGET_NEON"
-  "vmull.<US><V_sz_elem> %q0, %e1, %e3"
+  "vmull.<US><V_sz_elem>\t%q0, %e1, %e3"
   [(set_attr "neon_type" "neon_shift_1")]
 )
 
 (define_expand "vec_widen_<US>mult_lo_<mode>"
-  [(match_operand:<V_unpack> 0 "register_operand" "")
-   (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))
-   (SE:<V_unpack> (match_operand:VU 2 "register_operand" ""))]
- "TARGET_NEON"
- {
-   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
-   rtx t1;
-   int i;
-   for (i = 0; i < (<V_mode_nunits>/2) ; i++)
-     RTVEC_ELT (v, i) = GEN_INT (i);
-   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-
-   emit_insn (gen_neon_vec_<US>mult_lo_<mode> (operands[0],
- 					       operands[1],
-					       t1,
-					       operands[2]));
-   DONE;
- }
-)
+  [(match_operand:<V_unpack> 0 "s_register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "s_register_operand" ""))
+   (SE:<V_unpack> (match_operand:VU 2 "s_register_operand" ""))]
+  "TARGET_NEON"
+{
+  if (BYTES_BIG_ENDIAN)
+    {
+      rtx lo_1 = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 0);
+      rtx lo_2 = simplify_gen_subreg (<V_HALF>mode, operands[2], <MODE>mode, 0);
+      rtx tmp = gen_reg_rtx (<V_unpack>mode);
+      rtx tmp2 = gen_reg_rtx (V2DImode);
+      rtx tmp3 = gen_reg_rtx (V2DImode);
+
+      emit_insn (gen_neon_vmull<V_half> (tmp, lo_1, lo_2,
+					 GEN_INT (<SE_magic>)));
+      neon_reinterpret (tmp2, tmp);
+      emit_insn (gen_neon_vextv2di (tmp3, tmp2, tmp2, GEN_INT (1)));
+      neon_reinterpret (operands[0], tmp3);
+    }
+  else
+    {
+      rtvec v = rtvec_alloc (<V_mode_nunits> / 2);
+      rtx t1;
+      int i;
+
+      for (i = 0; i < (<V_mode_nunits> / 2); i++)
+	RTVEC_ELT (v, i) = GEN_INT (i);
+
+      t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+
+      emit_insn (gen_neon_vec_<US>mult_lo_<mode> (operands[0],
+ 						  operands[1],
+						  t1,
+						  operands[2]));
+    }
+  DONE;
+})
 
 (define_insn "neon_vec_<US>mult_hi_<mode>"
- [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
-      (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
-			    (match_operand:VU 1 "register_operand" "w") 
+  [(set (match_operand:<V_unpack> 0 "s_register_operand" "=w")
+	(mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
+			    (match_operand:VU 1 "s_register_operand" "w") 
 			    (match_operand:VU 2 "vect_par_constant_high" "")))
-		       (SE:<V_unpack> (vec_select:<V_HALF>
-			    (match_operand:VU 3 "register_operand" "w") 
+			 (SE:<V_unpack> (vec_select:<V_HALF>
+			    (match_operand:VU 3 "s_register_operand" "w") 
 			    (match_dup 2)))))]
   "TARGET_NEON"
-  "vmull.<US><V_sz_elem> %q0, %f1, %f3"
+  "vmull.<US><V_sz_elem>\t%q0, %f1, %f3"
   [(set_attr "neon_type" "neon_shift_1")]
 )
 
 (define_expand "vec_widen_<US>mult_hi_<mode>"
-  [(match_operand:<V_unpack> 0 "register_operand" "")
-   (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))
-   (SE:<V_unpack> (match_operand:VU 2 "register_operand" ""))]
- "TARGET_NEON"
- {
-   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
-   rtx t1;
-   int i;
-   for (i = 0; i < (<V_mode_nunits>/2) ; i++)
-     RTVEC_ELT (v, i) = GEN_INT (<V_mode_nunits>/2 + i);
-   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-
-   emit_insn (gen_neon_vec_<US>mult_hi_<mode> (operands[0],
- 					       operands[1],
-					       t1,
-					       operands[2]));
-   DONE;
-
- }
-)
+  [(match_operand:<V_unpack> 0 "s_register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "s_register_operand" ""))
+   (SE:<V_unpack> (match_operand:VU 2 "s_register_operand" ""))]
+  "TARGET_NEON"
+{
+  if (BYTES_BIG_ENDIAN)
+    {
+      rtx hi_1 = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, 8);
+      rtx hi_2 = simplify_gen_subreg (<V_HALF>mode, operands[2], <MODE>mode, 8);
+      rtx tmp = gen_reg_rtx (<V_unpack>mode);
+      rtx tmp2 = gen_reg_rtx (V2DImode);
+      rtx tmp3 = gen_reg_rtx (V2DImode);
+
+      emit_insn (gen_neon_vmull<V_half> (tmp, hi_1, hi_2,
+					 GEN_INT (<SE_magic>)));
+      neon_reinterpret (tmp2, tmp);
+      emit_insn (gen_neon_vextv2di (tmp3, tmp2, tmp2, GEN_INT (1)));
+      neon_reinterpret (operands[0], tmp3);
+    }
+  else
+    {
+      rtvec v = rtvec_alloc (<V_mode_nunits> / 2);
+      rtx t1;
+      int i;
+
+      for (i = 0; i < (<V_mode_nunits> / 2) ; i++)
+	RTVEC_ELT (v, i) = GEN_INT (<V_mode_nunits> / 2 + i);
+
+      t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+
+      emit_insn (gen_neon_vec_<US>mult_hi_<mode> (operands[0],
+ 						  operands[1],
+						  t1,
+						  operands[2]));
+    }
+  DONE;
+})
 
 ;; Vectorize for non-neon-quad case
 (define_insn "neon_unpack<US>_<mode>"
- [(set (match_operand:<V_widen> 0 "register_operand" "=w")
-       (SE:<V_widen> (match_operand:VDI 1 "register_operand" "w")))]
- "TARGET_NEON"
- "vmovl.<US><V_sz_elem> %q0, %P1"
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(SE:<V_widen> (match_operand:VDI 1 "s_register_operand" "w")))]
+  "TARGET_NEON"
+  "vmovl.<US><V_sz_elem> %q0, %P1"
   [(set_attr "neon_type" "neon_shift_1")]
 )
 
 (define_expand "vec_unpack<US>_lo_<mode>"
- [(match_operand:<V_double_width> 0 "register_operand" "")
-  (SE:<V_double_width>(match_operand:VDI 1 "register_operand"))]
- "TARGET_NEON"
+  [(match_operand:<V_double_width> 0 "s_register_operand" "")
+   (SE:<V_double_width>(match_operand:VDI 1 "s_register_operand" ""))]
+  "TARGET_NEON"
 {
-  rtx tmpreg = gen_reg_rtx (<V_widen>mode);
-  emit_insn (gen_neon_unpack<US>_<mode> (tmpreg, operands[1]));
-  emit_insn (gen_neon_vget_low<V_widen_l> (operands[0], tmpreg));
+  unsigned int subreg_offset = BYTES_BIG_ENDIAN ? 8 : 0;
+  rtx tmp = gen_reg_rtx (<V_widen>mode);
+
+  /* FIXME: currently combine.c performs an invalid optimization when canonical
+     RTL codes (sign_extend/zero_extend) are used in this case (together with
+     subregs).  Using neon_vmovl works around that.  */
+  if (1)
+    emit_insn (gen_neon_vmovl<mode> (tmp, operands[1], GEN_INT (<SE_magic>)));
+  else
+    emit_insn (gen_neon_unpack<US>_<mode> (tmp, operands[1]));
+
+  emit_move_insn (operands[0],
+		  simplify_gen_subreg (<V_double_width>mode, tmp,
+				       <V_widen>mode, subreg_offset));
 
   DONE;
-}
-)
+})
 
 (define_expand "vec_unpack<US>_hi_<mode>"
- [(match_operand:<V_double_width> 0 "register_operand" "")
-  (SE:<V_double_width>(match_operand:VDI 1 "register_operand"))]
- "TARGET_NEON"
+  [(match_operand:<V_double_width> 0 "s_register_operand" "")
+   (SE:<V_double_width> (match_operand:VDI 1 "s_register_operand" ""))]
+  "TARGET_NEON"
 {
-  rtx tmpreg = gen_reg_rtx (<V_widen>mode);
-  emit_insn (gen_neon_unpack<US>_<mode> (tmpreg, operands[1]));
-  emit_insn (gen_neon_vget_high<V_widen_l> (operands[0], tmpreg));
+  unsigned int subreg_offset = BYTES_BIG_ENDIAN ? 0 : 8;
+  rtx tmp = gen_reg_rtx (<V_widen>mode);
+
+  /* FIXME: See note in vec_unpack<US>_lo_<mode>.  */
+  if (1)
+    emit_insn (gen_neon_vmovl<mode> (tmp, operands[1], GEN_INT (<SE_magic>)));
+  else
+    emit_insn (gen_neon_unpack<US>_<mode> (tmp, operands[1]));
+
+  emit_move_insn (operands[0],
+		  simplify_gen_subreg (<V_double_width>mode, tmp,
+				       <V_widen>mode, subreg_offset));
 
   DONE;
-}
-)
+})
 
 (define_insn "neon_vec_<US>mult_<mode>"
- [(set (match_operand:<V_widen> 0 "register_operand" "=w")
-       (mult:<V_widen> (SE:<V_widen> 
-		 	   (match_operand:VDI 1 "register_operand" "w"))
- 		       (SE:<V_widen> 
-			   (match_operand:VDI 2 "register_operand" "w"))))]
+  [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+	(mult:<V_widen> (SE:<V_widen> 
+		 	   (match_operand:VDI 1 "s_register_operand" "w"))
+			(SE:<V_widen> 
+			   (match_operand:VDI 2 "s_register_operand" "w"))))]
   "TARGET_NEON"
   "vmull.<US><V_sz_elem> %q0, %P1, %P2"
   [(set_attr "neon_type" "neon_shift_1")]
 )
 
 (define_expand "vec_widen_<US>mult_hi_<mode>"
-  [(match_operand:<V_double_width> 0 "register_operand" "")
-   (SE:<V_double_width> (match_operand:VDI 1 "register_operand" ""))
-   (SE:<V_double_width> (match_operand:VDI 2 "register_operand" ""))]
- "TARGET_NEON"
- {
-   rtx tmpreg = gen_reg_rtx (<V_widen>mode);
-   emit_insn (gen_neon_vec_<US>mult_<mode> (tmpreg, operands[1], operands[2]));
-   emit_insn (gen_neon_vget_high<V_widen_l> (operands[0], tmpreg));
- 					    
-   DONE;
+  [(match_operand:<V_double_width> 0 "s_register_operand" "")
+   (SE:<V_double_width> (match_operand:VDI 1 "s_register_operand" ""))
+   (SE:<V_double_width> (match_operand:VDI 2 "s_register_operand" ""))]
+  "TARGET_NEON"
+{
+  unsigned int subreg_offset = BYTES_BIG_ENDIAN ? 0 : 8;
+  rtx tmp = gen_reg_rtx (<V_widen>mode);
+
+  /* FIXME: See note in vec_unpack<US>_lo_<mode>.  */
+  if (1)
+    emit_insn (gen_neon_vmull<mode> (tmp, operands[1], operands[2],
+				     GEN_INT (<SE_magic>)));
+  else
+    emit_insn (gen_neon_vec_<US>mult_<mode> (tmp, operands[1], operands[2]));
 
- }
-)
+  emit_move_insn (operands[0],
+		  simplify_gen_subreg (<V_double_width>mode, tmp,
+				       <V_widen>mode, subreg_offset));
+
+  DONE;
+})
 
 (define_expand "vec_widen_<US>mult_lo_<mode>"
-  [(match_operand:<V_double_width> 0 "register_operand" "")
-   (SE:<V_double_width> (match_operand:VDI 1 "register_operand" ""))
-   (SE:<V_double_width> (match_operand:VDI 2 "register_operand" ""))]
- "TARGET_NEON"
- {
-   rtx tmpreg = gen_reg_rtx (<V_widen>mode);
-   emit_insn (gen_neon_vec_<US>mult_<mode> (tmpreg, operands[1], operands[2]));
-   emit_insn (gen_neon_vget_low<V_widen_l> (operands[0], tmpreg));
- 					    
-   DONE;
-
- }
-)
-
-; FIXME: These instruction patterns can't be used safely in big-endian mode
-; because the ordering of vector elements in Q registers is different from what
-; the semantics of the instructions require.
+  [(match_operand:<V_double_width> 0 "s_register_operand" "")
+   (SE:<V_double_width> (match_operand:VDI 1 "s_register_operand" ""))
+   (SE:<V_double_width> (match_operand:VDI 2 "s_register_operand" ""))]
+  "TARGET_NEON"
+{
+  unsigned int subreg_offset = BYTES_BIG_ENDIAN ? 8 : 0;
+  rtx tmp = gen_reg_rtx (<V_widen>mode);
+
+  /* FIXME: See note in vec_unpack<US>_lo_<mode>.  */
+  if (1)
+    emit_insn (gen_neon_vmull<mode> (tmp, operands[1], operands[2],
+				     GEN_INT (<SE_magic>)));
+  else
+    emit_insn (gen_neon_vec_<US>mult_<mode> (tmp, operands[1], operands[2]));
+
+  emit_move_insn (operands[0],
+		  simplify_gen_subreg (<V_double_width>mode, tmp,
+				       <V_widen>mode, subreg_offset));
+
+  DONE;
+})
 
-(define_insn "vec_pack_trunc_<mode>"
-  [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
+(define_expand "vec_pack_trunc_<mode>"
+  [(set (match_operand:<V_narrow_pack> 0 "s_register_operand" "")
 	(vec_concat:<V_narrow_pack> 
 		(truncate:<V_narrow> 
-			(match_operand:VN 1 "register_operand" "w"))
+			(match_operand:VN 1 "s_register_operand" ""))
 		(truncate:<V_narrow>
-			(match_operand:VN 2 "register_operand" "w"))))]
- "TARGET_NEON && !BYTES_BIG_ENDIAN"
- "vmovn.i<V_sz_elem>\t%e0, %q1\;vmovn.i<V_sz_elem>\t%f0, %q2"
- [(set_attr "neon_type" "neon_shift_1")
-  (set_attr "length" "8")]
-)
+			(match_operand:VN 2 "s_register_operand" ""))))]
+ "TARGET_NEON"
+{
+  rtx tmp;
+  rtx destlo = simplify_gen_subreg (<V_narrow>mode, operands[0],
+				    <V_narrow_pack>mode, 0);
+  rtx desthi = simplify_gen_subreg (<V_narrow>mode, operands[0],
+				    <V_narrow_pack>mode, 8);
+
+  /* Operand 0 may overlap with operand 2, and the former is clobbered early.
+     Make a copy of the latter if necessary.  */
+
+  if (reg_overlap_mentioned_p (operands[0], operands[2]))
+    tmp = copy_to_reg (operands[2]);
+  else
+    tmp = operands[2];
+
+  if (BYTES_BIG_ENDIAN)
+    {
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      rtx rev = gen_reg_rtx (V4SImode);
+      
+      emit_insn (gen_neon_vmovn<mode> (destlo, operands[1], const0_rtx));
+      emit_insn (gen_neon_vmovn<mode> (desthi, tmp, const0_rtx));
+      neon_reinterpret (tmp2, operands[0]);
+      emit_insn (gen_neon_vrev64v4si (rev, tmp2, const0_rtx));
+      neon_reinterpret (operands[0], rev);
+    }
+  else
+    {
+      emit_insn (gen_neon_vec_pack_trunc_<mode> (destlo, operands[1]));
+      emit_insn (gen_neon_vec_pack_trunc_<mode> (desthi, tmp));
+    }
+
+  DONE;
+})
 
 ; For the non-quad case.
+
+; Note that the "truncate" opcode shouldn't be used in big-endian mode, since
+; the relative orderings of quad-word and double-word vectors differ: use of
+; truncate would not imply the necessary permutation. Use the "opaque"
+; neon_vmovn<mode> pattern in big-endian mode.
+
 (define_insn "neon_vec_pack_trunc_<mode>"
- [(set (match_operand:<V_narrow> 0 "register_operand" "=w")
-       (truncate:<V_narrow> (match_operand:VN 1 "register_operand" "w")))]
- "TARGET_NEON && !BYTES_BIG_ENDIAN"
- "vmovn.i<V_sz_elem>\t%P0, %q1"
- [(set_attr "neon_type" "neon_shift_1")]
+  [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+	(truncate:<V_narrow> (match_operand:VN 1 "s_register_operand" "w")))]
+  "TARGET_NEON && !BYTES_BIG_ENDIAN"
+  "vmovn.i<V_sz_elem>\t%P0, %q1"
+  [(set_attr "neon_type" "neon_shift_1")]
 )
 
 (define_expand "vec_pack_trunc_<mode>"
- [(match_operand:<V_narrow_pack> 0 "register_operand" "")
-  (match_operand:VSHFT 1 "register_operand" "")
-  (match_operand:VSHFT 2 "register_operand" "")]
- "TARGET_NEON && !BYTES_BIG_ENDIAN"
+  [(match_operand:<V_narrow_pack> 0 "s_register_operand" "")
+   (match_operand:VSHFT 1 "s_register_operand" "")
+   (match_operand:VSHFT 2 "s_register_operand" "")]
+  "TARGET_NEON"
 {
   rtx tempreg = gen_reg_rtx (<V_DOUBLE>mode);
   
-  emit_insn (gen_move_lo_quad_<V_double> (tempreg, operands[1]));
-  emit_insn (gen_move_hi_quad_<V_double> (tempreg, operands[2]));
-  emit_insn (gen_neon_vec_pack_trunc_<V_double> (operands[0], tempreg));
+  if (BYTES_BIG_ENDIAN)
+    {
+      /* An open-coded "permute", to switch the order of the hi/lo double-words
+         constituting the input operands of the instruction.  */
+      emit_insn (gen_move_lo_quad_<V_double> (tempreg, operands[2]));
+      emit_insn (gen_move_hi_quad_<V_double> (tempreg, operands[1]));
+      emit_insn (gen_neon_vmovn<V_double> (operands[0], tempreg, const0_rtx));
+    }
+  else
+    {
+      emit_insn (gen_move_lo_quad_<V_double> (tempreg, operands[1]));
+      emit_insn (gen_move_hi_quad_<V_double> (tempreg, operands[2]));
+      emit_insn (gen_neon_vec_pack_trunc_<V_double> (operands[0], tempreg));
+    }
   DONE;
 })